Diffstat (limited to 'contrib/llvm/lib/Transforms')
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp  137
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp  223
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp  321
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp  862
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h  323
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h  107
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp  742
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp  326
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp  158
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp  1060
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp  47
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp  250
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp  178
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp  1051
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp  96
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp  163
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp  122
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp  1319
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp  887
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp  294
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp  2600
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp  180
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp  286
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPO.cpp  123
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp  116
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Inliner.cpp  1001
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Internalize.cpp  294
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp  311
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp  1673
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp  892
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp  993
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp  992
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PruneEH.cpp  270
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp  1496
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp  88
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp  380
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp  491
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp  1424
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp  1750
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp  2645
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp  4416
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp  2238
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp  5099
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h  760
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  1561
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp  1599
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp  1018
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp  1546
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp  794
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  1679
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp  1492
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  3248
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp  2973
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp  212
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h  230
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp  1628
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp  915
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp  994
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp  685
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp  971
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp  79
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h  111
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp  3669
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp  1562
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp  419
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp  666
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp  703
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h  179
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h  108
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp  278
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h  89
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp  41
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h  88
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp  176
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp  694
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp  127
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp  2171
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp  177
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h  81
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp  94
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp  410
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/PtrState.h  210
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp  664
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  450
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp  118
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp  797
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp  104
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  580
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp  163
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp  1242
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp  1087
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp  525
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp  2375
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp  1045
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp  883
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp  694
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp  22
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp  2543
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  1607
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp  969
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp  2355
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp  1397
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp  25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp  341
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp  367
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp  987
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  1685
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp  208
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp  1298
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp  666
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp  92
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp  330
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp  1761
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp  748
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  109
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp  373
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp  5462
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp  1225
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp  1527
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp  578
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp  174
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  383
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp  137
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp  1485
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  431
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp  509
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp  3941
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  176
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp  680
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp  2281
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp  128
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  2725
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp  1983
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp  4319
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp  282
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp  772
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  1264
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  808
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp  287
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp  306
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp  319
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  701
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp  937
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp  849
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp  150
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp  252
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp  769
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp  328
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp  1015
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp  479
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp  833
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneModule.cpp  200
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp  108
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp  1122
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp  165
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp  151
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp  95
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Evaluator.cpp  597
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp  482
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp  923
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp  262
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp  196
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp  205
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp  2282
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp  63
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp  674
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LCSSA.cpp  438
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp  565
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Local.cpp  2210
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp  877
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp  871
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp  554
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp  847
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp  1391
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp  323
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp  94
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp  510
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp  531
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp  108
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp  161
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp  271
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp  121
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp  32
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp  793
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp  1000
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp  495
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp  108
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp  5996
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  765
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp  152
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp  2440
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SplitModule.cpp  263
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp  80
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp  42
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp  565
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp  116
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Utils.cpp  45
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp  495
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp  1109
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  1073
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  8128
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  5157
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp  48
205 files changed, 186425 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
new file mode 100644
index 000000000000..359876627fce
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -0,0 +1,137 @@
+//===- CoroCleanup.cpp - Coroutine Cleanup Pass ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass lowers all remaining coroutine intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-cleanup"
+
+namespace {
+// Created on demand if CoroCleanup pass has work to do.
+struct Lowerer : coro::LowererBase {
+ IRBuilder<> Builder;
+ Lowerer(Module &M) : LowererBase(M), Builder(Context) {}
+ bool lowerRemainingCoroIntrinsics(Function &F);
+};
+}
+
+static void simplifyCFG(Function &F) {
+ llvm::legacy::FunctionPassManager FPM(F.getParent());
+ FPM.add(createCFGSimplificationPass());
+
+ FPM.doInitialization();
+ FPM.run(F);
+ FPM.doFinalization();
+}
+
+static void lowerSubFn(IRBuilder<> &Builder, CoroSubFnInst *SubFn) {
+ Builder.SetInsertPoint(SubFn);
+ Value *FrameRaw = SubFn->getFrame();
+ int Index = SubFn->getIndex();
+
+ auto *FrameTy = StructType::get(
+ SubFn->getContext(), {Builder.getInt8PtrTy(), Builder.getInt8PtrTy()});
+ PointerType *FramePtrTy = FrameTy->getPointerTo();
+
+ Builder.SetInsertPoint(SubFn);
+ auto *FramePtr = Builder.CreateBitCast(FrameRaw, FramePtrTy);
+ auto *Gep = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index);
+ auto *Load = Builder.CreateLoad(Gep);
+
+ SubFn->replaceAllUsesWith(Load);
+}
+
+bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
+ bool Changed = false;
+
+ for (auto IB = inst_begin(F), E = inst_end(F); IB != E;) {
+ Instruction &I = *IB++;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_begin:
+ II->replaceAllUsesWith(II->getArgOperand(1));
+ break;
+ case Intrinsic::coro_free:
+ II->replaceAllUsesWith(II->getArgOperand(1));
+ break;
+ case Intrinsic::coro_alloc:
+ II->replaceAllUsesWith(ConstantInt::getTrue(Context));
+ break;
+ case Intrinsic::coro_id:
+ II->replaceAllUsesWith(ConstantTokenNone::get(Context));
+ break;
+ case Intrinsic::coro_subfn_addr:
+ lowerSubFn(Builder, cast<CoroSubFnInst>(II));
+ break;
+ }
+ II->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ if (Changed) {
+ // After replacements were made, we can clean up the function body a little.
+ simplifyCFG(F);
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroCleanup : FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CoroCleanup() : FunctionPass(ID) {
+ initializeCoroCleanupPass(*PassRegistry::getPassRegistry());
+ }
+
+ std::unique_ptr<Lowerer> L;
+
+ // This pass has work to do only if we find intrinsics we are going to lower
+ // in the module.
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin",
+ "llvm.coro.subfn.addr", "llvm.coro.free",
+ "llvm.coro.id"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (L)
+ return L->lowerRemainingCoroIntrinsics(F);
+ return false;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (!L)
+ AU.setPreservesAll();
+ }
+ StringRef getPassName() const override { return "Coroutine Cleanup"; }
+};
+}
+
+char CoroCleanup::ID = 0;
+INITIALIZE_PASS(CoroCleanup, "coro-cleanup",
+ "Lower all coroutine related intrinsics", false, false)
+
+Pass *llvm::createCoroCleanupPass() { return new CoroCleanup(); }
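[Editorial note] An illustrative sketch of the frame-header layout that lowerSubFn() above relies on: it models the frame as a pair of i8* function pointers and selects one of them by index. The struct and helper below are invented for illustration and are not LLVM APIs; the 0/1 mapping to resume/destroy mirrors CoroSubFnInst::ResumeIndex and CoroSubFnInst::DestroyIndex as used elsewhere in these passes.

// Sketch only: mirrors the {i8*, i8*} mock-up that lowerSubFn() builds before
// it GEPs into the frame and loads the selected field.
struct CoroFrameHeader {
  void (*Resume)(void *Frame);  // field 0: result of coro.subfn.addr(frame, 0)
  void (*Destroy)(void *Frame); // field 1: result of coro.subfn.addr(frame, 1)
};

// Rough C++ equivalent of the GEP + load that lowerSubFn() emits.
static void (*subfnAddr(void *Frame, int Index))(void *) {
  auto *Hdr = static_cast<CoroFrameHeader *>(Frame);
  return Index == 0 ? Hdr->Resume : Hdr->Destroy;
}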
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
new file mode 100644
index 000000000000..ba05896af150
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -0,0 +1,223 @@
+//===- CoroEarly.cpp - Coroutine Early Function Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass lowers coroutine intrinsics that hide the details of the exact
+// calling convention for coroutine resume and destroy functions and details of
+// the structure of the coroutine frame.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-early"
+
+namespace {
+// Created on demand if CoroEarly pass has work to do.
+class Lowerer : public coro::LowererBase {
+ IRBuilder<> Builder;
+ PointerType *const AnyResumeFnPtrTy;
+
+ void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind);
+ void lowerCoroPromise(CoroPromiseInst *Intrin);
+ void lowerCoroDone(IntrinsicInst *II);
+
+public:
+ Lowerer(Module &M)
+ : LowererBase(M), Builder(Context),
+ AnyResumeFnPtrTy(FunctionType::get(Type::getVoidTy(Context), Int8Ptr,
+ /*isVarArg=*/false)
+ ->getPointerTo()) {}
+ bool lowerEarlyIntrinsics(Function &F);
+};
+}
+
+// Replace a direct call to coro.resume or coro.destroy with an indirect call to
+// an address returned by coro.subfn.addr intrinsic. This is done so that
+// CGPassManager recognizes devirtualization when CoroElide pass replaces a call
+// to coro.subfn.addr with an appropriate function address.
+void Lowerer::lowerResumeOrDestroy(CallSite CS,
+ CoroSubFnInst::ResumeKind Index) {
+ Value *ResumeAddr =
+ makeSubFnCall(CS.getArgOperand(0), Index, CS.getInstruction());
+ CS.setCalledFunction(ResumeAddr);
+ CS.setCallingConv(CallingConv::Fast);
+}
+
+// The coroutine promise field is always at a fixed offset from the beginning
+// of the coroutine frame. The i8* coro.promise(i8*, i1 from) intrinsic adds an
+// offset to a passed pointer to move from the coroutine frame to the coroutine
+// promise and vice versa. Since we don't know exactly which coroutine frame it
+// is, we build a coroutine frame mock-up starting with two function pointers,
+// followed by a properly aligned coroutine promise field.
+// TODO: Handle the case when the coroutine promise alloca has an align
+// override.
+void Lowerer::lowerCoroPromise(CoroPromiseInst *Intrin) {
+ Value *Operand = Intrin->getArgOperand(0);
+ unsigned Alignement = Intrin->getAlignment();
+ Type *Int8Ty = Builder.getInt8Ty();
+
+ auto *SampleStruct =
+ StructType::get(Context, {AnyResumeFnPtrTy, AnyResumeFnPtrTy, Int8Ty});
+ const DataLayout &DL = TheModule.getDataLayout();
+ int64_t Offset = alignTo(
+ DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignement);
+ if (Intrin->isFromPromise())
+ Offset = -Offset;
+
+ Builder.SetInsertPoint(Intrin);
+ Value *Replacement =
+ Builder.CreateConstInBoundsGEP1_32(Int8Ty, Operand, Offset);
+
+ Intrin->replaceAllUsesWith(Replacement);
+ Intrin->eraseFromParent();
+}
+
+// When a coroutine reaches final suspend point, it zeros out ResumeFnAddr in
+// the coroutine frame (it is UB to resume from a final suspend point).
+// The llvm.coro.done intrinsic is used to check whether a coroutine is
+// suspended at the final suspend point or not.
+void Lowerer::lowerCoroDone(IntrinsicInst *II) {
+ Value *Operand = II->getArgOperand(0);
+
+ // ResumeFnAddr is the first pointer sized element of the coroutine frame.
+ auto *FrameTy = Int8Ptr;
+ PointerType *FramePtrTy = FrameTy->getPointerTo();
+
+ Builder.SetInsertPoint(II);
+ auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy);
+ auto *Gep = Builder.CreateConstInBoundsGEP1_32(FrameTy, BCI, 0);
+ auto *Load = Builder.CreateLoad(Gep);
+ auto *Cond = Builder.CreateICmpEQ(Load, NullPtr);
+
+ II->replaceAllUsesWith(Cond);
+ II->eraseFromParent();
+}
+
+// Prior to CoroSplit, calls to coro.begin need to be marked as NoDuplicate,
+// as CoroSplit assumes there is exactly one coro.begin. After CoroSplit, the
+// NoDuplicate attribute is removed from coro.begin; otherwise, it would
+// interfere with inlining.
+static void setCannotDuplicate(CoroIdInst *CoroId) {
+ for (User *U : CoroId->users())
+ if (auto *CB = dyn_cast<CoroBeginInst>(U))
+ CB->setCannotDuplicate();
+}
+
+bool Lowerer::lowerEarlyIntrinsics(Function &F) {
+ bool Changed = false;
+ CoroIdInst *CoroId = nullptr;
+ SmallVector<CoroFreeInst *, 4> CoroFrees;
+ for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) {
+ Instruction &I = *IB++;
+ if (auto CS = CallSite(&I)) {
+ switch (CS.getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_free:
+ CoroFrees.push_back(cast<CoroFreeInst>(&I));
+ break;
+ case Intrinsic::coro_suspend:
+ // Make sure that final suspend point is not duplicated as CoroSplit
+ // pass expects that there is at most one final suspend point.
+ if (cast<CoroSuspendInst>(&I)->isFinal())
+ CS.setCannotDuplicate();
+ break;
+ case Intrinsic::coro_end:
+ // Make sure that fallthrough coro.end is not duplicated as CoroSplit
+ // pass expects that there is at most one fallthrough coro.end.
+ if (cast<CoroEndInst>(&I)->isFallthrough())
+ CS.setCannotDuplicate();
+ break;
+ case Intrinsic::coro_id:
+ // Mark a function that comes out of the frontend that has a coro.id
+ // with a coroutine attribute.
+ if (auto *CII = cast<CoroIdInst>(&I)) {
+ if (CII->getInfo().isPreSplit()) {
+ F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT);
+ setCannotDuplicate(CII);
+ CII->setCoroutineSelf();
+ CoroId = cast<CoroIdInst>(&I);
+ }
+ }
+ break;
+ case Intrinsic::coro_resume:
+ lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex);
+ break;
+ case Intrinsic::coro_destroy:
+ lowerResumeOrDestroy(CS, CoroSubFnInst::DestroyIndex);
+ break;
+ case Intrinsic::coro_promise:
+ lowerCoroPromise(cast<CoroPromiseInst>(&I));
+ break;
+ case Intrinsic::coro_done:
+ lowerCoroDone(cast<IntrinsicInst>(&I));
+ break;
+ }
+ Changed = true;
+ }
+ }
+ // Make sure that all CoroFrees reference the coro.id intrinsic.
+ // Token type is not exposed through coroutine C/C++ builtins to plain C, so
+ // we allow specifying none and fix it up here.
+ if (CoroId)
+ for (CoroFreeInst *CF : CoroFrees)
+ CF->setArgOperand(0, CoroId);
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroEarly : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid.
+ CoroEarly() : FunctionPass(ID) {
+ initializeCoroEarlyPass(*PassRegistry::getPassRegistry());
+ }
+
+ std::unique_ptr<Lowerer> L;
+
+ // This pass has work to do only if we find intrinsics we are going to lower
+ // in the module.
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.destroy",
+ "llvm.coro.done", "llvm.coro.end",
+ "llvm.coro.free", "llvm.coro.promise",
+ "llvm.coro.resume", "llvm.coro.suspend"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (!L)
+ return false;
+
+ return L->lowerEarlyIntrinsics(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+ StringRef getPassName() const override {
+ return "Lower early coroutine intrinsics";
+ }
+};
+}
+
+char CoroEarly::ID = 0;
+INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics",
+ false, false)
+
+Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); }
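[Editorial note] A hedged sketch of the offset arithmetic performed by lowerCoroPromise() above, assuming a target where the third element of the {fnptr, fnptr, promise} mock-up starts two pointer sizes into the frame. The helper names alignUp, frameToPromise, and promiseToFrame are invented for illustration and are not LLVM APIs.

#include <cstdint>

// Round Value up to the next multiple of Align (mirrors llvm::alignTo).
static std::int64_t alignUp(std::int64_t Value, std::int64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// coro.promise(%frame, align, /*from=*/false): frame pointer -> promise.
static char *frameToPromise(char *Frame, std::int64_t PromiseAlign) {
  return Frame + alignUp(2 * sizeof(void *), PromiseAlign);
}

// coro.promise(%promise, align, /*from=*/true): promise -> frame pointer.
static char *promiseToFrame(char *Promise, std::int64_t PromiseAlign) {
  return Promise - alignUp(2 * sizeof(void *), PromiseAlign);
}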
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
new file mode 100644
index 000000000000..42fd6d746145
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -0,0 +1,321 @@
+//===- CoroElide.cpp - Coroutine Frame Allocation Elision Pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces dynamic allocation of coroutine frame with alloca and
+// replaces calls to llvm.coro.resume and llvm.coro.destroy with direct calls
+// to coroutine sub-functions.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-elide"
+
+namespace {
+// Created on demand if CoroElide pass has work to do.
+struct Lowerer : coro::LowererBase {
+ SmallVector<CoroIdInst *, 4> CoroIds;
+ SmallVector<CoroBeginInst *, 1> CoroBegins;
+ SmallVector<CoroAllocInst *, 1> CoroAllocs;
+ SmallVector<CoroSubFnInst *, 4> ResumeAddr;
+ SmallVector<CoroSubFnInst *, 4> DestroyAddr;
+ SmallVector<CoroFreeInst *, 1> CoroFrees;
+
+ Lowerer(Module &M) : LowererBase(M) {}
+
+ void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA);
+ bool shouldElide() const;
+ bool processCoroId(CoroIdInst *, AAResults &AA);
+};
+} // end anonymous namespace
+
+// Go through the list of coro.subfn.addr intrinsics and replace them with the
+// provided constant.
+static void replaceWithConstant(Constant *Value,
+ SmallVectorImpl<CoroSubFnInst *> &Users) {
+ if (Users.empty())
+ return;
+
+ // See if we need to bitcast the constant to match the type of the intrinsic
+ // being replaced. Note: All coro.subfn.addr intrinsics return the same type,
+ // so we only need to examine the type of the first one in the list.
+ Type *IntrTy = Users.front()->getType();
+ Type *ValueTy = Value->getType();
+ if (ValueTy != IntrTy) {
+ // May need to tweak the function type to match the type expected at the
+ // use site.
+ assert(ValueTy->isPointerTy() && IntrTy->isPointerTy());
+ Value = ConstantExpr::getBitCast(Value, IntrTy);
+ }
+
+ // Now the value type matches the type of the intrinsic. Replace them all!
+ for (CoroSubFnInst *I : Users)
+ replaceAndRecursivelySimplify(I, Value);
+}
+
+// See if any operand of the call instruction references the coroutine frame.
+static bool operandReferences(CallInst *CI, AllocaInst *Frame, AAResults &AA) {
+ for (Value *Op : CI->operand_values())
+ if (AA.alias(Op, Frame) != NoAlias)
+ return true;
+ return false;
+}
+
+// Look for any tail calls referencing the coroutine frame and remove the tail
+// attribute from them, since the coroutine frame now resides on the stack and
+// a tail call implies that the function does not reference anything on the
+// stack.
+static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
+ Function &F = *Frame->getFunction();
+ MemoryLocation Mem(Frame);
+ for (Instruction &I : instructions(F))
+ if (auto *Call = dyn_cast<CallInst>(&I))
+ if (Call->isTailCall() && operandReferences(Call, Frame, AA)) {
+ // FIXME: If we ever hit this check. Evaluate whether it is more
+ // appropriate to retain musttail and allow the code to compile.
+ if (Call->isMustTailCall())
+ report_fatal_error("Call referring to the coroutine frame cannot be "
+ "marked as musttail");
+ Call->setTailCall(false);
+ }
+}
+
+// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
+static Type *getFrameType(Function *Resume) {
+ auto *ArgType = Resume->arg_begin()->getType();
+ return cast<PointerType>(ArgType)->getElementType();
+}
+
+// Finds first non alloca instruction in the entry block of a function.
+static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
+ for (Instruction &I : F->getEntryBlock())
+ if (!isa<AllocaInst>(&I))
+ return &I;
+ llvm_unreachable("no terminator in the entry block");
+}
+
+// To elide heap allocations we need to suppress code blocks guarded by
+// llvm.coro.alloc and llvm.coro.free instructions.
+void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
+ LLVMContext &C = FrameTy->getContext();
+ auto *InsertPt =
+ getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction());
+
+ // Replacing llvm.coro.alloc with false will suppress dynamic
+ // allocation, as the frontend is expected to generate code that
+ // looks like:
+ // id = coro.id(...)
+ // mem = coro.alloc(id) ? malloc(coro.size()) : 0;
+ // coro.begin(id, mem)
+ auto *False = ConstantInt::getFalse(C);
+ for (auto *CA : CoroAllocs) {
+ CA->replaceAllUsesWith(False);
+ CA->eraseFromParent();
+ }
+
+ // FIXME: Design how to transmit alignment information for every alloca that
+ // is spilled into the coroutine frame and recreate the alignment information
+ // here. Possibly we will need to do a mini SROA here and break the coroutine
+ // frame into individual AllocaInst recreating the original alignment.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
+ auto *FrameVoidPtr =
+ new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
+
+ for (auto *CB : CoroBegins) {
+ CB->replaceAllUsesWith(FrameVoidPtr);
+ CB->eraseFromParent();
+ }
+
+ // Since the coroutine frame now lives on the stack, we need to make sure
+ // that any tail call referencing it is made a non-tail call.
+ removeTailCallAttribute(Frame, AA);
+}
+
+bool Lowerer::shouldElide() const {
+ // If no CoroAllocs, we cannot suppress allocation, so elision is not
+ // possible.
+ if (CoroAllocs.empty())
+ return false;
+
+ // Check that for every coro.begin there is a coro.destroy directly
+ // referencing the SSA value of that coro.begin. If the value escaped, then
+ // coro.destroy would have been referencing a memory location storing that
+ // value and not the virtual register.
+
+ SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+
+ for (CoroSubFnInst *DA : DestroyAddr) {
+ if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame()))
+ ReferencedCoroBegins.insert(CB);
+ else
+ return false;
+ }
+
+ // If the size of the set is the same as the total number of CoroBegins, it
+ // means we found a coro.destroy referring to every coro.begin, so we can
+ // perform heap elision.
+ return ReferencedCoroBegins.size() == CoroBegins.size();
+}
+
+bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA) {
+ CoroBegins.clear();
+ CoroAllocs.clear();
+ CoroFrees.clear();
+ ResumeAddr.clear();
+ DestroyAddr.clear();
+
+ // Collect all coro.begin and coro.allocs associated with this coro.id.
+ for (User *U : CoroId->users()) {
+ if (auto *CB = dyn_cast<CoroBeginInst>(U))
+ CoroBegins.push_back(CB);
+ else if (auto *CA = dyn_cast<CoroAllocInst>(U))
+ CoroAllocs.push_back(CA);
+ else if (auto *CF = dyn_cast<CoroFreeInst>(U))
+ CoroFrees.push_back(CF);
+ }
+
+ // Collect all coro.subfn.addrs associated with coro.begin.
+ // Note, we only devirtualize the calls if their coro.subfn.addr refers to
+ // coro.begin directly. If we run into cases where this check is too
+ // conservative, we can consider relaxing the check.
+ for (CoroBeginInst *CB : CoroBegins) {
+ for (User *U : CB->users())
+ if (auto *II = dyn_cast<CoroSubFnInst>(U))
+ switch (II->getIndex()) {
+ case CoroSubFnInst::ResumeIndex:
+ ResumeAddr.push_back(II);
+ break;
+ case CoroSubFnInst::DestroyIndex:
+ DestroyAddr.push_back(II);
+ break;
+ default:
+ llvm_unreachable("unexpected coro.subfn.addr constant");
+ }
+ }
+
+ // PostSplit coro.id refers to an array of subfunctions in its Info
+ // argument.
+ ConstantArray *Resumers = CoroId->getInfo().Resumers;
+ assert(Resumers && "PostSplit coro.id Info argument must refer to an array "
+ "of coroutine subfunctions");
+ auto *ResumeAddrConstant =
+ ConstantExpr::getExtractValue(Resumers, CoroSubFnInst::ResumeIndex);
+
+ replaceWithConstant(ResumeAddrConstant, ResumeAddr);
+
+ bool ShouldElide = shouldElide();
+
+ auto *DestroyAddrConstant = ConstantExpr::getExtractValue(
+ Resumers,
+ ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex);
+
+ replaceWithConstant(DestroyAddrConstant, DestroyAddr);
+
+ if (ShouldElide) {
+ auto *FrameTy = getFrameType(cast<Function>(ResumeAddrConstant));
+ elideHeapAllocations(CoroId->getFunction(), FrameTy, AA);
+ coro::replaceCoroFree(CoroId, /*Elide=*/true);
+ }
+
+ return true;
+}
+
+// See if there are any coro.subfn.addr instructions referring to the
+// coro.devirt trigger; if so, replace them with a direct call to the devirt
+// trigger function.
+static bool replaceDevirtTrigger(Function &F) {
+ SmallVector<CoroSubFnInst *, 1> DevirtAddr;
+ for (auto &I : instructions(F))
+ if (auto *SubFn = dyn_cast<CoroSubFnInst>(&I))
+ if (SubFn->getIndex() == CoroSubFnInst::RestartTrigger)
+ DevirtAddr.push_back(SubFn);
+
+ if (DevirtAddr.empty())
+ return false;
+
+ Module &M = *F.getParent();
+ Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN);
+ assert(DevirtFn && "coro.devirt.fn not found");
+ replaceWithConstant(DevirtFn, DevirtAddr);
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct CoroElide : FunctionPass {
+ static char ID;
+ CoroElide() : FunctionPass(ID) {
+ initializeCoroElidePass(*PassRegistry::getPassRegistry());
+ }
+
+ std::unique_ptr<Lowerer> L;
+
+ bool doInitialization(Module &M) override {
+ if (coro::declaresIntrinsics(M, {"llvm.coro.id"}))
+ L = llvm::make_unique<Lowerer>(M);
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (!L)
+ return false;
+
+ bool Changed = false;
+
+ if (F.hasFnAttribute(CORO_PRESPLIT_ATTR))
+ Changed = replaceDevirtTrigger(F);
+
+ L->CoroIds.clear();
+
+ // Collect all PostSplit coro.ids.
+ for (auto &I : instructions(F))
+ if (auto *CII = dyn_cast<CoroIdInst>(&I))
+ if (CII->getInfo().isPostSplit())
+ // If it is the coroutine itself, don't touch it.
+ if (CII->getCoroutine() != CII->getFunction())
+ L->CoroIds.push_back(CII);
+
+ // If we did not find any coro.id, there is nothing to do.
+ if (L->CoroIds.empty())
+ return Changed;
+
+ AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ for (auto *CII : L->CoroIds)
+ Changed |= L->processCoroId(CII, AA);
+
+ return Changed;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+ StringRef getPassName() const override { return "Coroutine Elision"; }
+};
+}
+
+char CoroElide::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ CoroElide, "coro-elide",
+ "Coroutine frame allocation elision and indirect calls replacement", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(
+ CoroElide, "coro-elide",
+ "Coroutine frame allocation elision and indirect calls replacement", false,
+ false)
+
+Pass *llvm::createCoroElidePass() { return new CoroElide(); }
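[Editorial note] A simplified before/after sketch of the elision the comments above describe. The placeholder functions coroAlloc, coroBegin, and resumeCoro stand in for the llvm.coro.* intrinsics and are not real APIs; cleanup of the heap path via coro.free is omitted for brevity.

#include <cstddef>
#include <cstdlib>

bool coroAlloc();            // placeholder for llvm.coro.alloc
void *coroBegin(void *Mem);  // placeholder for llvm.coro.begin
void resumeCoro(void *Hdl);  // placeholder for an (indirect) resume call
constexpr std::size_t FrameSize = 64; // assumed frame size for the example

// Before elision: heap allocation guarded by coro.alloc, handle from coro.begin.
void callerBefore() {
  void *Mem = coroAlloc() ? std::malloc(FrameSize) : nullptr;
  void *Handle = coroBegin(Mem);
  resumeCoro(Handle);
}

// After elision: coro.alloc folds to false, the malloc path becomes dead, and
// the handle is simply the address of a frame alloca in the caller.
void callerAfter() {
  alignas(void *) char Frame[FrameSize];
  resumeCoro(Frame);
}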
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
new file mode 100644
index 000000000000..85e9003ec3c5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -0,0 +1,862 @@
+//===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains classes used to discover if for a particular value
+// there is a path from its definition to a use that crosses a suspend point.
+//
+// Using the information discovered we form a Coroutine Frame structure to
+// contain those values. All uses of those values are replaced with appropriate
+// GEP + load from the coroutine frame. At the point of the definition we spill
+// the value into the coroutine frame.
+//
+// TODO: pack values tightly using liveness info.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/circular_raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
+// "coro-frame", which results in leaner debug spew.
+#define DEBUG_TYPE "coro-suspend-crossing"
+
+enum { SmallVectorThreshold = 32 };
+
+// Provides two way mapping between the blocks and numbers.
+namespace {
+class BlockToIndexMapping {
+ SmallVector<BasicBlock *, SmallVectorThreshold> V;
+
+public:
+ size_t size() const { return V.size(); }
+
+ BlockToIndexMapping(Function &F) {
+ for (BasicBlock &BB : F)
+ V.push_back(&BB);
+ std::sort(V.begin(), V.end());
+ }
+
+ size_t blockToIndex(BasicBlock *BB) const {
+ auto *I = std::lower_bound(V.begin(), V.end(), BB);
+ assert(I != V.end() && *I == BB && "BasicBlockNumbering: Unknown block");
+ return I - V.begin();
+ }
+
+ BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
+};
+} // end anonymous namespace
+
+// The SuspendCrossingInfo maintains data that allows us to answer the
+// question of whether, given two BasicBlocks A and B, there is a path from A
+// to B that passes through a suspend point.
+//
+// For every basic block 'i' it maintains a BlockData that consists of:
+// Consumes: a bit vector which contains a set of indices of blocks that can
+// reach block 'i'.
+// Kills: a bit vector which contains a set of indices of blocks that can
+// reach block 'i', but where at least one path crosses a suspend point.
+// Suspend: a boolean indicating whether block 'i' contains a suspend point.
+// End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
+//
+namespace {
+struct SuspendCrossingInfo {
+ BlockToIndexMapping Mapping;
+
+ struct BlockData {
+ BitVector Consumes;
+ BitVector Kills;
+ bool Suspend = false;
+ bool End = false;
+ };
+ SmallVector<BlockData, SmallVectorThreshold> Block;
+
+ iterator_range<succ_iterator> successors(BlockData const &BD) const {
+ BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
+ return llvm::successors(BB);
+ }
+
+ BlockData &getBlockData(BasicBlock *BB) {
+ return Block[Mapping.blockToIndex(BB)];
+ }
+
+ void dump() const;
+ void dump(StringRef Label, BitVector const &BV) const;
+
+ SuspendCrossingInfo(Function &F, coro::Shape &Shape);
+
+ bool hasPathCrossingSuspendPoint(BasicBlock *DefBB, BasicBlock *UseBB) const {
+ size_t const DefIndex = Mapping.blockToIndex(DefBB);
+ size_t const UseIndex = Mapping.blockToIndex(UseBB);
+
+ assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def");
+ bool const Result = Block[UseIndex].Kills[DefIndex];
+ DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
+ << " answer is " << Result << "\n");
+ return Result;
+ }
+
+ bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
+ auto *I = cast<Instruction>(U);
+
+ // We rewrote PHINodes, so that only the ones with exactly one incoming
+ // value need to be analyzed.
+ if (auto *PN = dyn_cast<PHINode>(I))
+ if (PN->getNumIncomingValues() > 1)
+ return false;
+
+ BasicBlock *UseBB = I->getParent();
+ return hasPathCrossingSuspendPoint(DefBB, UseBB);
+ }
+
+ bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
+ return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
+ }
+
+ bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
+ return isDefinitionAcrossSuspend(I.getParent(), U);
+ }
+};
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
+ BitVector const &BV) const {
+ dbgs() << Label << ":";
+ for (size_t I = 0, N = BV.size(); I < N; ++I)
+ if (BV[I])
+ dbgs() << " " << Mapping.indexToBlock(I)->getName();
+ dbgs() << "\n";
+}
+
+LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
+ for (size_t I = 0, N = Block.size(); I < N; ++I) {
+ BasicBlock *const B = Mapping.indexToBlock(I);
+ dbgs() << B->getName() << ":\n";
+ dump(" Consumes", Block[I].Consumes);
+ dump(" Kills", Block[I].Kills);
+ }
+ dbgs() << "\n";
+}
+#endif
+
+SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
+ : Mapping(F) {
+ const size_t N = Mapping.size();
+ Block.resize(N);
+
+ // Initialize every block so that it consumes itself
+ for (size_t I = 0; I < N; ++I) {
+ auto &B = Block[I];
+ B.Consumes.resize(N);
+ B.Kills.resize(N);
+ B.Consumes.set(I);
+ }
+
+ // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
+ // the code beyond coro.end is reachable during initial invocation of the
+ // coroutine.
+ for (auto *CE : Shape.CoroEnds)
+ getBlockData(CE->getParent()).End = true;
+
+ // Mark all suspend blocks and indicate that they kill everything they
+ // consume. Note, that crossing coro.save also requires a spill, as any code
+ // between coro.save and coro.suspend may resume the coroutine and all of the
+ // state needs to be saved by that time.
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
+ BasicBlock *SuspendBlock = BarrierInst->getParent();
+ auto &B = getBlockData(SuspendBlock);
+ B.Suspend = true;
+ B.Kills |= B.Consumes;
+ };
+ for (CoroSuspendInst *CSI : Shape.CoroSuspends) {
+ markSuspendBlock(CSI);
+ markSuspendBlock(CSI->getCoroSave());
+ }
+
+ // Iterate propagating consumes and kills until they stop changing.
+ int Iteration = 0;
+ (void)Iteration;
+
+ bool Changed;
+ do {
+ DEBUG(dbgs() << "iteration " << ++Iteration);
+ DEBUG(dbgs() << "==============\n");
+
+ Changed = false;
+ for (size_t I = 0; I < N; ++I) {
+ auto &B = Block[I];
+ for (BasicBlock *SI : successors(B)) {
+
+ auto SuccNo = Mapping.blockToIndex(SI);
+
+ // Save the Consumes and Kills bitsets so that it is easy to see
+ // if anything changed after propagation.
+ auto &S = Block[SuccNo];
+ auto SavedConsumes = S.Consumes;
+ auto SavedKills = S.Kills;
+
+ // Propagate Kills and Consumes from block B into its successor S.
+ S.Consumes |= B.Consumes;
+ S.Kills |= B.Kills;
+
+ // If block B is a suspend block, it should propagate kills into its
+ // successor for every block B consumes.
+ if (B.Suspend) {
+ S.Kills |= B.Consumes;
+ }
+ if (S.Suspend) {
+ // If block S is a suspend block, it should kill all of the blocks it
+ // consumes.
+ S.Kills |= S.Consumes;
+ } else if (S.End) {
+ // If block S is an end block, it should not propagate kills as the
+ // blocks following coro.end() are reached during initial invocation
+ // of the coroutine while all the data are still available on the
+ // stack or in the registers.
+ S.Kills.reset();
+ } else {
+ // This is reached when block S is neither a suspend block nor a
+ // coro.end block, and we need to make sure that it is not in its own
+ // kill set.
+ S.Kills.reset(SuccNo);
+ }
+
+ // See if anything changed.
+ Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
+
+ if (S.Kills != SavedKills) {
+ DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
+ << "\n");
+ DEBUG(dump("S.Kills", S.Kills));
+ DEBUG(dump("SavedKills", SavedKills));
+ }
+ if (S.Consumes != SavedConsumes) {
+ DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
+ DEBUG(dump("S.Consume", S.Consumes));
+ DEBUG(dump("SavedCons", SavedConsumes));
+ }
+ }
+ }
+ } while (Changed);
+ DEBUG(dump());
+}
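+
+// Editorial note, not part of the original source: the propagation loop above
+// is a standard forward dataflow over bit vectors. In pseudocode, for every
+// block B and each successor S of B it computes, until nothing changes:
+//
+//   S.Consumes |= B.Consumes
+//   S.Kills    |= B.Kills | (B.Suspend ? B.Consumes : {})
+//   if S.Suspend:   S.Kills |= S.Consumes
+//   else if S.End:  S.Kills.clear()
+//   else:           S.Kills.reset(S)   // a block does not kill its own defs
+//
+// A value defined in block D and used in block U then needs a spill exactly
+// when Block[U].Kills[D] is set (see hasPathCrossingSuspendPoint above).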
+
+#undef DEBUG_TYPE // "coro-suspend-crossing"
+#define DEBUG_TYPE "coro-frame"
+
+// We build up the list of spills for every case where a use is separated
+// from the definition by a suspend point.
+
+struct Spill : std::pair<Value *, Instruction *> {
+ using base = std::pair<Value *, Instruction *>;
+
+ Spill(Value *Def, User *U) : base(Def, cast<Instruction>(U)) {}
+
+ Value *def() const { return first; }
+ Instruction *user() const { return second; }
+ BasicBlock *userBlock() const { return second->getParent(); }
+
+ std::pair<Value *, BasicBlock *> getKey() const {
+ return {def(), userBlock()};
+ }
+
+ bool operator<(Spill const &rhs) const { return getKey() < rhs.getKey(); }
+};
+
+// Note that there may be more than one record with the same value of Def in
+// the SpillInfo vector.
+using SpillInfo = SmallVector<Spill, 8>;
+
+#ifndef NDEBUG
+static void dump(StringRef Title, SpillInfo const &Spills) {
+ dbgs() << "------------- " << Title << "--------------\n";
+ Value *CurrentValue = nullptr;
+ for (auto const &E : Spills) {
+ if (CurrentValue != E.def()) {
+ CurrentValue = E.def();
+ CurrentValue->dump();
+ }
+ dbgs() << " user: ";
+ E.user()->dump();
+ }
+}
+#endif
+
+// Build a struct that will keep state for an active coroutine.
+// struct f.frame {
+// ResumeFnTy ResumeFnAddr;
+// ResumeFnTy DestroyFnAddr;
+// int ResumeIndex;
+// ... promise (if present) ...
+// ... spills ...
+// };
+static StructType *buildFrameType(Function &F, coro::Shape &Shape,
+ SpillInfo &Spills) {
+ LLVMContext &C = F.getContext();
+ SmallString<32> Name(F.getName());
+ Name.append(".Frame");
+ StructType *FrameTy = StructType::create(C, Name);
+ auto *FramePtrTy = FrameTy->getPointerTo();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
+ /*IsVarArgs=*/false);
+ auto *FnPtrTy = FnTy->getPointerTo();
+
+ // Figure out how wide an integer type storing the suspend index should be.
+ unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
+ Type *PromiseType = Shape.PromiseAlloca
+ ? Shape.PromiseAlloca->getType()->getElementType()
+ : Type::getInt1Ty(C);
+ SmallVector<Type *, 8> Types{FnPtrTy, FnPtrTy, PromiseType,
+ Type::getIntNTy(C, IndexBits)};
+ Value *CurrentDef = nullptr;
+
+ // Create an entry for every spilled value.
+ for (auto const &S : Spills) {
+ if (CurrentDef == S.def())
+ continue;
+
+ CurrentDef = S.def();
+ // PromiseAlloca was already added to Types array earlier.
+ if (CurrentDef == Shape.PromiseAlloca)
+ continue;
+
+ Type *Ty = nullptr;
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentDef))
+ Ty = AI->getAllocatedType();
+ else
+ Ty = CurrentDef->getType();
+
+ Types.push_back(Ty);
+ }
+ FrameTy->setBody(Types);
+
+ return FrameTy;
+}
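+
+// Editorial example (assumption, not from the original source): for a
+// coroutine f with two suspend points, no promise, and a single spilled i32,
+// buildFrameType would produce roughly:
+//
+//   %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, i32 }
+//
+// i.e. the resume and destroy function pointers, an i1 placeholder for the
+// missing promise, an i1-wide suspend index (Log2_64_Ceil(2) == 1), and the
+// spilled value.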
+
+// We need to make room to insert a spill after the initial PHIs, but before
+// the catchswitch instruction. Placing it there would violate the requirement
+// that a catchswitch, like all other EHPads, must be the first non-PHI in a
+// block.
+//
+// Split away catchswitch into a separate block and insert in its place:
+//
+// cleanuppad <InsertPt> cleanupret.
+//
+// cleanupret instruction will act as an insert point for the spill.
+static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
+ BasicBlock *CurrentBlock = CatchSwitch->getParent();
+ BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
+ CurrentBlock->getTerminator()->eraseFromParent();
+
+ auto *CleanupPad =
+ CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
+ auto *CleanupRet =
+ CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
+ return CleanupRet;
+}
+
+// Replace all alloca and SSA values that are accessed across suspend points
+// with GetElementPointer from coroutine frame + loads and stores. Create an
+// AllocaSpillBB that will become the new entry block for the resume parts of
+// the coroutine:
+//
+// %hdl = coro.begin(...)
+// whatever
+//
+// becomes:
+//
+// %hdl = coro.begin(...)
+// %FramePtr = bitcast i8* hdl to %f.frame*
+// br label %AllocaSpillBB
+//
+// AllocaSpillBB:
+// ; geps corresponding to allocas that were moved to coroutine frame
+// br label PostSpill
+//
+// PostSpill:
+// whatever
+//
+//
+static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
+ auto *CB = Shape.CoroBegin;
+ IRBuilder<> Builder(CB->getNextNode());
+ PointerType *FramePtrTy = Shape.FrameTy->getPointerTo();
+ auto *FramePtr =
+ cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr"));
+ Type *FrameTy = FramePtrTy->getElementType();
+
+ Value *CurrentValue = nullptr;
+ BasicBlock *CurrentBlock = nullptr;
+ Value *CurrentReload = nullptr;
+ unsigned Index = coro::Shape::LastKnownField;
+
+ // We need to keep track of any allocas that need "spilling", since they
+ // will now live in the coroutine frame. All accesses to them need to be
+ // changed, not just the accesses across suspend points, so we remember the
+ // allocas and their indices to be handled once we have processed all the
+ // spills.
+ SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas;
+ // Promise alloca (if present) has a fixed field number (Shape::PromiseField)
+ if (Shape.PromiseAlloca)
+ Allocas.emplace_back(Shape.PromiseAlloca, coro::Shape::PromiseField);
+
+ // Create a load instruction to reload the spilled value from the coroutine
+ // frame.
+ auto CreateReload = [&](Instruction *InsertBefore) {
+ Builder.SetInsertPoint(InsertBefore);
+ auto *G = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, Index,
+ CurrentValue->getName() +
+ Twine(".reload.addr"));
+ return isa<AllocaInst>(CurrentValue)
+ ? G
+ : Builder.CreateLoad(G,
+ CurrentValue->getName() + Twine(".reload"));
+ };
+
+ for (auto const &E : Spills) {
+ // If we have not seen the value, generate a spill.
+ if (CurrentValue != E.def()) {
+ CurrentValue = E.def();
+ CurrentBlock = nullptr;
+ CurrentReload = nullptr;
+
+ ++Index;
+
+ if (auto *AI = dyn_cast<AllocaInst>(CurrentValue)) {
+ // A spilled AllocaInst will be replaced with a GEP from the coroutine
+ // frame, so no spill is required.
+ Allocas.emplace_back(AI, Index);
+ if (!AI->isStaticAlloca())
+ report_fatal_error("Coroutines cannot handle non static allocas yet");
+ } else {
+ // Otherwise, create a store instruction storing the value into the
+ // coroutine frame.
+
+ Instruction *InsertPt = nullptr;
+ if (isa<Argument>(CurrentValue)) {
+ // For arguments, we will place the store instruction right after
+ // the coroutine frame pointer instruction, i.e. bitcast of
+ // coro.begin from i8* to %f.frame*.
+ InsertPt = FramePtr->getNextNode();
+ } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) {
+ // If we are spilling the result of the invoke instruction, split the
+ // normal edge and insert the spill in the new block.
+ auto NewBB = SplitEdge(II->getParent(), II->getNormalDest());
+ InsertPt = NewBB->getTerminator();
+ } else if (dyn_cast<PHINode>(CurrentValue)) {
+ // Skip the PHINodes and EH pads instructions.
+ BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent();
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
+ InsertPt = splitBeforeCatchSwitch(CSI);
+ else
+ InsertPt = &*DefBlock->getFirstInsertionPt();
+ } else {
+ // For all other values, the spill is placed immediately after
+ // the definition.
+ assert(!isa<TerminatorInst>(E.def()) && "unexpected terminator");
+ InsertPt = cast<Instruction>(E.def())->getNextNode();
+ }
+
+ Builder.SetInsertPoint(InsertPt);
+ auto *G = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, Index,
+ CurrentValue->getName() + Twine(".spill.addr"));
+ Builder.CreateStore(CurrentValue, G);
+ }
+ }
+
+ // If we have not seen the use block, generate a reload in it.
+ if (CurrentBlock != E.userBlock()) {
+ CurrentBlock = E.userBlock();
+ CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt());
+ }
+
+ // If we have a single edge PHINode, remove it and replace it with a reload
+ // from the coroutine frame. (We already took care of multi edge PHINodes
+ // by rewriting them in the rewritePHIs function).
+ if (auto *PN = dyn_cast<PHINode>(E.user())) {
+ assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+ "values in the PHINode");
+ PN->replaceAllUsesWith(CurrentReload);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Replace all uses of CurrentValue in the current instruction with reload.
+ E.user()->replaceUsesOfWith(CurrentValue, CurrentReload);
+ }
+
+ BasicBlock *FramePtrBB = FramePtr->getParent();
+ Shape.AllocaSpillBlock =
+ FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB");
+ Shape.AllocaSpillBlock->splitBasicBlock(&Shape.AllocaSpillBlock->front(),
+ "PostSpill");
+
+ Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front());
+ // If we found any allocas, replace all of their remaining uses with Geps.
+ for (auto &P : Allocas) {
+ auto *G =
+ Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, P.second);
+ // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) here,
+ // as we are changing location of the instruction.
+ G->takeName(P.first);
+ P.first->replaceAllUsesWith(G);
+ P.first->eraseFromParent();
+ }
+ return FramePtr;
+}
+
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode and it is the last
+ // PHI Node. So, if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for OldPred. This
+ // happens because the incoming block lists of PHI nodes are usually in
+ // the same order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case we do
+// EH-specific handling.
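+// When the successor begins with a landingpad, the pad cloned here feeds the
+// LandingPadReplacement PHI set up by rewritePHIs; for funclet-based EH
+// (catchswitch and friends), a new cleanuppad/cleanupret pair is emitted
+// instead.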
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
+static void rewritePHIs(BasicBlock &BB) {
+ // For every incoming edge we will create a block holding all
+ // incoming values in a single PHI node.
+ //
+ // loop:
+ // %n.val = phi i32[%n, %entry], [%inc, %loop]
+ //
+ // It will create:
+ //
+ // loop.from.entry:
+ // %n.loop.pre = phi i32 [%n, %entry]
+ // br label %loop
+ // loop.from.loop:
+ // %inc.loop.pre = phi i32 [%inc, %loop]
+ // br label %loop
+ //
+ // After this rewrite, further analysis will ignore any phi nodes with more
+ // than one incoming edge.
+
+ // TODO: Simplify PHINodes in the basic block to remove duplicate
+ // predecessors.
+
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
+ SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
+ for (BasicBlock *Pred : Preds) {
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
+ IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
+ auto *PN = cast<PHINode>(&BB.front());
+ do {
+ int Index = PN->getBasicBlockIndex(IncomingBB);
+ Value *V = PN->getIncomingValue(Index);
+ PHINode *InputV = PHINode::Create(
+ V->getType(), 1, V->getName() + Twine(".") + BB.getName(),
+ &IncomingBB->front());
+ InputV->addIncoming(V, Pred);
+ PN->setIncomingValue(Index, InputV);
+ PN = dyn_cast<PHINode>(PN->getNextNode());
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+ // Calls to the ehAwareSplitEdge function cloned the original landing pad.
+ // We no longer need it.
+ LandingPad->eraseFromParent();
+ }
+}
+
+static void rewritePHIs(Function &F) {
+ SmallVector<BasicBlock *, 8> WorkList;
+
+ for (BasicBlock &BB : F)
+ if (auto *PN = dyn_cast<PHINode>(&BB.front()))
+ if (PN->getNumIncomingValues() > 1)
+ WorkList.push_back(&BB);
+
+ for (BasicBlock *BB : WorkList)
+ rewritePHIs(*BB);
+}
+
+// Check for instructions that we can recreate on resume as opposed to spilling
+// the result into a coroutine frame.
+static bool materializable(Instruction &V) {
+ return isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+ isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V);
+}
+
+// Check for structural coroutine intrinsics that should not be spilled into
+// the coroutine frame.
+static bool isCoroutineStructureIntrinsic(Instruction &I) {
+ return isa<CoroIdInst>(&I) || isa<CoroBeginInst>(&I) ||
+ isa<CoroSaveInst>(&I) || isa<CoroSuspendInst>(&I);
+}
+
+// For every use of a value that is live across a suspend point, recreate that
+// value after the suspend point.
+static void rewriteMaterializableInstructions(IRBuilder<> &IRB,
+ SpillInfo const &Spills) {
+ BasicBlock *CurrentBlock = nullptr;
+ Instruction *CurrentMaterialization = nullptr;
+ Instruction *CurrentDef = nullptr;
+
+ for (auto const &E : Spills) {
+ // If it is a new definition, update CurrentXXX variables.
+ if (CurrentDef != E.def()) {
+ CurrentDef = cast<Instruction>(E.def());
+ CurrentBlock = nullptr;
+ CurrentMaterialization = nullptr;
+ }
+
+ // If we have not seen this block, materialize the value.
+ if (CurrentBlock != E.userBlock()) {
+ CurrentBlock = E.userBlock();
+ CurrentMaterialization = cast<Instruction>(CurrentDef)->clone();
+ CurrentMaterialization->setName(CurrentDef->getName());
+ CurrentMaterialization->insertBefore(
+ &*CurrentBlock->getFirstInsertionPt());
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(E.user())) {
+ assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+ "values in the PHINode");
+ PN->replaceAllUsesWith(CurrentMaterialization);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Replace all uses of CurrentDef in the current instruction with the
+ // CurrentMaterialization for the block.
+ E.user()->replaceUsesOfWith(CurrentDef, CurrentMaterialization);
+ }
+}
+
+// Move early uses of spilled variables after CoroBegin.
+// For example, if a parameter had its address taken, we may end up with code
+// like:
+// define @f(i32 %n) {
+// %n.addr = alloca i32
+// store %n, %n.addr
+// ...
+// call @coro.begin
+// We need to move the store after coro.begin.
+static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills,
+ CoroBeginInst *CoroBegin) {
+ DominatorTree DT(F);
+ SmallVector<Instruction *, 8> NeedsMoving;
+
+ Value *CurrentValue = nullptr;
+
+ for (auto const &E : Spills) {
+ if (CurrentValue == E.def())
+ continue;
+
+ CurrentValue = E.def();
+
+ for (User *U : CurrentValue->users()) {
+ Instruction *I = cast<Instruction>(U);
+ if (!DT.dominates(CoroBegin, I)) {
+ // TODO: Make this more robust. Currently, if we run into a situation
+ // where a simple instruction move won't work, we panic and call
+ // report_fatal_error.
+ for (User *UI : I->users()) {
+ if (!DT.dominates(CoroBegin, cast<Instruction>(UI)))
+ report_fatal_error("cannot move instruction since its users are not"
+ " dominated by CoroBegin");
+ }
+
+ DEBUG(dbgs() << "will move: " << *I << "\n");
+ NeedsMoving.push_back(I);
+ }
+ }
+ }
+
+ Instruction *InsertPt = CoroBegin->getNextNode();
+ for (Instruction *I : NeedsMoving)
+ I->moveBefore(InsertPt);
+}
+
+// Splits the block at a particular instruction, unless it is the first
+// instruction in the block and the block has a single predecessor.
+static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
+ auto *BB = I->getParent();
+ if (&BB->front() == I) {
+ if (BB->getSinglePredecessor()) {
+ BB->setName(Name);
+ return BB;
+ }
+ }
+ return BB->splitBasicBlock(I, Name);
+}
+
+// Split above and below a particular instruction so that it
+// ends up alone in its own block.
+static void splitAround(Instruction *I, const Twine &Name) {
+ splitBlockIfNotFirst(I, Name);
+ splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
+}
+
+void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
+ // Lower dbg.declare to dbg.value, since we are going to rewrite
+ // accesses to local variables.
+ LowerDbgDeclare(F);
+
+ Shape.PromiseAlloca = Shape.CoroBegin->getId()->getPromise();
+ if (Shape.PromiseAlloca) {
+ Shape.CoroBegin->getId()->clearPromise();
+ }
+
+ // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
+ // intrinsics are in their own blocks to simplify the logic of building up
+ // SuspendCrossing data.
+ for (CoroSuspendInst *CSI : Shape.CoroSuspends) {
+ splitAround(CSI->getCoroSave(), "CoroSave");
+ splitAround(CSI, "CoroSuspend");
+ }
+
+ // Put CoroEnds into their own blocks.
+ for (CoroEndInst *CE : Shape.CoroEnds)
+ splitAround(CE, "CoroEnd");
+
+ // Transform multi-edge PHI nodes so that any value feeding into a PHI never
+ // has its definition separated from the PHI by a suspend point.
+ rewritePHIs(F);
+
+ // Build suspend crossing info.
+ SuspendCrossingInfo Checker(F, Shape);
+
+ IRBuilder<> Builder(F.getContext());
+ SpillInfo Spills;
+
+ for (int Repeat = 0; Repeat < 4; ++Repeat) {
+ // See if there are materializable instructions across suspend points.
+ for (Instruction &I : instructions(F))
+ if (materializable(I))
+ for (User *U : I.users())
+ if (Checker.isDefinitionAcrossSuspend(I, U))
+ Spills.emplace_back(&I, U);
+
+ if (Spills.empty())
+ break;
+
+ // Rewrite materializable instructions to be materialized at the use point.
+ DEBUG(dump("Materializations", Spills));
+ rewriteMaterializableInstructions(Builder, Spills);
+ Spills.clear();
+ }
+
+ // Collect the spills for arguments and other non-materializable values.
+ for (Argument &A : F.args())
+ for (User *U : A.users())
+ if (Checker.isDefinitionAcrossSuspend(A, U))
+ Spills.emplace_back(&A, U);
+
+ for (Instruction &I : instructions(F)) {
+ // Values returned from coroutine structure intrinsics should not be part
+ // of the Coroutine Frame.
+ if (isCoroutineStructureIntrinsic(I))
+ continue;
+ // The coroutine promise is always included in the coroutine frame; no need
+ // to check for suspend crossing.
+ if (Shape.PromiseAlloca == &I)
+ continue;
+
+ for (User *U : I.users())
+ if (Checker.isDefinitionAcrossSuspend(I, U)) {
+ // We cannot spill a token.
+ if (I.getType()->isTokenTy())
+ report_fatal_error(
+ "token definition is separated from the use by a suspend point");
+ Spills.emplace_back(&I, U);
+ }
+ }
+ DEBUG(dump("Spills", Spills));
+ moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
+ Shape.FrameTy = buildFrameType(F, Shape, Spills);
+ Shape.FramePtr = insertSpills(Spills, Shape);
+}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
new file mode 100644
index 000000000000..9a8cc5a2591c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -0,0 +1,323 @@
+//===-- CoroInstr.h - Coroutine Intrinsics Instruction Wrappers -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines classes that make it really easy to deal with intrinsic
+// functions with the isa/dyncast family of functions. In particular, this
+// allows you to do things like:
+//
+// if (auto *SF = dyn_cast<CoroSubFnInst>(Inst))
+// ... SF->getFrame() ...
+//
+// All intrinsic function calls are instances of the call instruction, so these
+// are all subclasses of the CallInst class. Note that none of these classes
+// has state or virtual methods, which is an important part of this gross/neat
+// hack working.
+//
+// The helpful comment above is borrowed from llvm/IntrinsicInst.h; we keep
+// coroutine intrinsic wrappers here since they are only used by the passes in
+// the Coroutine library.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+namespace llvm {
+
+/// This class represents the llvm.coro.subfn.addr instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSubFnInst : public IntrinsicInst {
+ enum { FrameArg, IndexArg };
+
+public:
+ enum ResumeKind {
+ RestartTrigger = -1,
+ ResumeIndex,
+ DestroyIndex,
+ CleanupIndex,
+ IndexLast,
+ IndexFirst = RestartTrigger
+ };
+
+ Value *getFrame() const { return getArgOperand(FrameArg); }
+ ResumeKind getIndex() const {
+ int64_t Index = getRawIndex()->getValue().getSExtValue();
+ assert(Index >= IndexFirst && Index < IndexLast &&
+ "unexpected CoroSubFnInst index argument");
+ return static_cast<ResumeKind>(Index);
+ }
+
+ ConstantInt *getRawIndex() const {
+ return cast<ConstantInt>(getArgOperand(IndexArg));
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_subfn_addr;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.alloc instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_alloc;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.id instruction.
+class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst {
+ enum { AlignArg, PromiseArg, CoroutineArg, InfoArg };
+
+public:
+ CoroAllocInst *getCoroAlloc() {
+ for (User *U : users())
+ if (auto *CA = dyn_cast<CoroAllocInst>(U))
+ return CA;
+ return nullptr;
+ }
+
+ IntrinsicInst *getCoroBegin() {
+ for (User *U : users())
+ if (auto *II = dyn_cast<IntrinsicInst>(U))
+ if (II->getIntrinsicID() == Intrinsic::coro_begin)
+ return II;
+ llvm_unreachable("no coro.begin associated with coro.id");
+ }
+
+ AllocaInst *getPromise() const {
+ Value *Arg = getArgOperand(PromiseArg);
+ return isa<ConstantPointerNull>(Arg)
+ ? nullptr
+ : cast<AllocaInst>(Arg->stripPointerCasts());
+ }
+
+ void clearPromise() {
+ Value *Arg = getArgOperand(PromiseArg);
+ setArgOperand(PromiseArg,
+ ConstantPointerNull::get(Type::getInt8PtrTy(getContext())));
+ if (isa<AllocaInst>(Arg))
+ return;
+ assert((isa<BitCastInst>(Arg) || isa<GetElementPtrInst>(Arg)) &&
+ "unexpected instruction designating the promise");
+ // TODO: Add a check that any remaining users of Inst are after coro.begin
+ // or add code to move the users after coro.begin.
+ auto *Inst = cast<Instruction>(Arg);
+ if (Inst->use_empty()) {
+ Inst->eraseFromParent();
+ return;
+ }
+ Inst->moveBefore(getCoroBegin()->getNextNode());
+ }
+
+ // Info argument of coro.id is
+ // fresh out of the frontend: null ;
+ // outlined : {Init, Return, Susp1, Susp2, ...} ;
+ // postsplit : [resume, destroy, cleanup] ;
+ //
+ // If parts of the coroutine were outlined to protect against undesirable
+ // code motion, these functions will be stored in a struct literal referred to
+ // by the Info parameter. Note: this is only needed before the coroutine is
+ // split.
+ //
+ // After the coroutine is split, the resume functions are stored in an array
+ // referred to by this parameter.
+
+ struct Info {
+ ConstantStruct *OutlinedParts = nullptr;
+ ConstantArray *Resumers = nullptr;
+
+ bool hasOutlinedParts() const { return OutlinedParts != nullptr; }
+ bool isPostSplit() const { return Resumers != nullptr; }
+ bool isPreSplit() const { return !isPostSplit(); }
+ };
+ Info getInfo() const {
+ Info Result;
+ auto *GV = dyn_cast<GlobalVariable>(getRawInfo());
+ if (!GV)
+ return Result;
+
+ assert(GV->isConstant() && GV->hasDefinitiveInitializer());
+ Constant *Initializer = GV->getInitializer();
+ if ((Result.OutlinedParts = dyn_cast<ConstantStruct>(Initializer)))
+ return Result;
+
+ Result.Resumers = cast<ConstantArray>(Initializer);
+ return Result;
+ }
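+ // A minimal usage sketch (illustrative only; the passes query the split
+ // state in the same way):
+ //   if (auto *Id = dyn_cast<CoroIdInst>(V))
+ //     if (Id->getInfo().isPostSplit())
+ //       ... use Id->getInfo().Resumers ...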
+ Constant *getRawInfo() const {
+ return cast<Constant>(getArgOperand(InfoArg)->stripPointerCasts());
+ }
+
+ void setInfo(Constant *C) { setArgOperand(InfoArg, C); }
+
+ Function *getCoroutine() const {
+ return cast<Function>(getArgOperand(CoroutineArg)->stripPointerCasts());
+ }
+ void setCoroutineSelf() {
+ assert(isa<ConstantPointerNull>(getArgOperand(CoroutineArg)) &&
+ "Coroutine argument is already assigned");
+ auto *const Int8PtrTy = Type::getInt8PtrTy(getContext());
+ setArgOperand(CoroutineArg,
+ ConstantExpr::getBitCast(getFunction(), Int8PtrTy));
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_id;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.frame instruction.
+class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_frame;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.free instruction.
+class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst {
+ enum { IdArg, FrameArg };
+
+public:
+ Value *getFrame() const { return getArgOperand(FrameArg); }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_free;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This class represents the llvm.coro.begin instruction.
+class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst {
+ enum { IdArg, MemArg };
+
+public:
+ CoroIdInst *getId() const { return cast<CoroIdInst>(getArgOperand(IdArg)); }
+
+ Value *getMem() const { return getArgOperand(MemArg); }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_begin;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.save instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_save;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.promise instruction.
+class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst {
+ enum { FrameArg, AlignArg, FromArg };
+
+public:
+ bool isFromPromise() const {
+ return cast<Constant>(getArgOperand(FromArg))->isOneValue();
+ }
+ unsigned getAlignment() const {
+ return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_promise;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.suspend instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public IntrinsicInst {
+ enum { SaveArg, FinalArg };
+
+public:
+ CoroSaveInst *getCoroSave() const {
+ Value *Arg = getArgOperand(SaveArg);
+ if (auto *SI = dyn_cast<CoroSaveInst>(Arg))
+ return SI;
+ assert(isa<ConstantTokenNone>(Arg));
+ return nullptr;
+ }
+ bool isFinal() const {
+ return cast<Constant>(getArgOperand(FinalArg))->isOneValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_suspend;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.size instruction.
+class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_size;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+/// This represents the llvm.coro.end instruction.
+class LLVM_LIBRARY_VISIBILITY CoroEndInst : public IntrinsicInst {
+ enum { FrameArg, UnwindArg };
+
+public:
+ bool isFallthrough() const { return !isUnwind(); }
+ bool isUnwind() const {
+ return cast<Constant>(getArgOperand(UnwindArg))->isOneValue();
+ }
+
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_end;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+} // End namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
new file mode 100644
index 000000000000..1eac88dbac3a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -0,0 +1,107 @@
+//===- CoroInternal.h - Internal Coroutine interfaces ---------*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Common definitions/declarations used internally by coroutine lowering passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H
+
+#include "CoroInstr.h"
+#include "llvm/Transforms/Coroutines.h"
+
+namespace llvm {
+
+class CallGraph;
+class CallGraphSCC;
+class PassRegistry;
+
+void initializeCoroEarlyPass(PassRegistry &);
+void initializeCoroSplitPass(PassRegistry &);
+void initializeCoroElidePass(PassRegistry &);
+void initializeCoroCleanupPass(PassRegistry &);
+
+// The CoroEarly pass marks every function that has coro.begin with the string
+// attribute "coroutine.presplit"="0". The CoroSplit pass processes a coroutine
+// twice. The first time, it lets the coroutine go through the complete IPO
+// optimization pipeline as a single function and forces a restart of the
+// pipeline by inserting an indirect call to the empty function
+// "coro.devirt.trigger"; once the CoroElide pass devirtualizes that call, the
+// CGPassManager restarts the pipeline. When the CoroSplit pass sees the same
+// coroutine the second time, it splits it up and adds the coroutine
+// subfunctions to the SCC to be processed by the IPO pipeline.
+
+#define CORO_PRESPLIT_ATTR "coroutine.presplit"
+#define UNPREPARED_FOR_SPLIT "0"
+#define PREPARED_FOR_SPLIT "1"
+
+#define CORO_DEVIRT_TRIGGER_FN "coro.devirt.trigger"
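+
+// A minimal sketch of the handshake described above (illustrative only; the
+// real logic lives in CoroEarly.cpp and CoroSplit.cpp):
+//   F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT);  // CoroEarly
+//   if (F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() ==
+//       UNPREPARED_FOR_SPLIT) {
+//     // CoroSplit: insert the devirt trigger, mark PREPARED_FOR_SPLIT.
+//   }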
+
+namespace coro {
+
+bool declaresIntrinsics(Module &M, std::initializer_list<StringRef>);
+void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement);
+void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement);
+void replaceCoroFree(CoroIdInst *CoroId, bool Elide);
+void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs,
+ CallGraph &CG, CallGraphSCC &SCC);
+
+// Keeps data and helper functions for lowering coroutine intrinsics.
+struct LowererBase {
+ Module &TheModule;
+ LLVMContext &Context;
+ PointerType *const Int8Ptr;
+ FunctionType *const ResumeFnType;
+ ConstantPointerNull *const NullPtr;
+
+ LowererBase(Module &M);
+ Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt);
+};
+
+// Holds the structural coroutine intrinsics for a particular function and
+// other values used during the CoroSplit pass.
+struct LLVM_LIBRARY_VISIBILITY Shape {
+ CoroBeginInst *CoroBegin;
+ SmallVector<CoroEndInst *, 4> CoroEnds;
+ SmallVector<CoroSizeInst *, 2> CoroSizes;
+ SmallVector<CoroSuspendInst *, 4> CoroSuspends;
+
+ // Field Indexes for known coroutine frame fields.
+ enum {
+ ResumeField,
+ DestroyField,
+ PromiseField,
+ IndexField,
+ LastKnownField = IndexField
+ };
+
+ StructType *FrameTy;
+ Instruction *FramePtr;
+ BasicBlock *AllocaSpillBlock;
+ SwitchInst *ResumeSwitch;
+ AllocaInst *PromiseAlloca;
+ bool HasFinalSuspend;
+
+ IntegerType *getIndexType() const {
+ assert(FrameTy && "frame type not assigned");
+ return cast<IntegerType>(FrameTy->getElementType(IndexField));
+ }
+ ConstantInt *getIndex(uint64_t Value) const {
+ return ConstantInt::get(getIndexType(), Value);
+ }
+
+ Shape() = default;
+ explicit Shape(Function &F) { buildFrom(F); }
+ void buildFrom(Function &F);
+};
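+
+// Typical usage (illustrative; see splitCoroutine in CoroSplit.cpp): build a
+// Shape over a function and bail out early if it is not a coroutine:
+//   coro::Shape Shape(F);
+//   if (!Shape.CoroBegin)
+//     return;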
+
+void buildCoroutineFrame(Function &F, Shape &Shape);
+
+} // End namespace coro.
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
new file mode 100644
index 000000000000..173dc05f0584
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -0,0 +1,742 @@
+//===- CoroSplit.cpp - Converts a coroutine into a state machine ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass builds the coroutine frame and outlines resume and destroy parts
+// of the coroutine into separate functions.
+//
+// We present a coroutine to LLVM as an ordinary function with suspension
+// points marked up with intrinsics. We let the optimizer party on the coroutine
+// as a single function for as long as possible. Shortly before the coroutine is
+// eligible to be inlined into its callers, we split up the coroutine into parts
+// corresponding to the initial, resume and destroy invocations of the coroutine,
+// add them to the current SCC and restart the IPO pipeline to optimize the
+// coroutine subfunctions we extracted before proceeding to the caller of the
+// coroutine.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coro-split"
+
+// Create an entry block for a resume function with a switch that will jump to
+// suspend points.
+static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) {
+ LLVMContext &C = F.getContext();
+
+ // resume.entry:
+ // %index.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0,
+ // i32 2
+ // %index = load i32, i32* %index.addr
+ // switch i32 %index, label %unreachable [
+ // i32 0, label %resume.0
+ // i32 1, label %resume.1
+ // ...
+ // ]
+
+ auto *NewEntry = BasicBlock::Create(C, "resume.entry", &F);
+ auto *UnreachBB = BasicBlock::Create(C, "unreachable", &F);
+
+ IRBuilder<> Builder(NewEntry);
+ auto *FramePtr = Shape.FramePtr;
+ auto *FrameTy = Shape.FrameTy;
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr");
+ auto *Index = Builder.CreateLoad(GepIndex, "index");
+ auto *Switch =
+ Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size());
+ Shape.ResumeSwitch = Switch;
+
+ size_t SuspendIndex = 0;
+ for (CoroSuspendInst *S : Shape.CoroSuspends) {
+ ConstantInt *IndexVal = Shape.getIndex(SuspendIndex);
+
+ // Replace CoroSave with a store to Index:
+ // %index.addr = getelementptr %f.frame... (index field number)
+ // store i32 0, i32* %index.addr1
+ auto *Save = S->getCoroSave();
+ Builder.SetInsertPoint(Save);
+ if (S->isFinal()) {
+ // Final suspend point is represented by storing zero in ResumeFnAddr.
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0,
+ 0, "ResumeFn.addr");
+ auto *NullPtr = ConstantPointerNull::get(cast<PointerType>(
+ cast<PointerType>(GepIndex->getType())->getElementType()));
+ Builder.CreateStore(NullPtr, GepIndex);
+ } else {
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(
+ FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr");
+ Builder.CreateStore(IndexVal, GepIndex);
+ }
+ Save->replaceAllUsesWith(ConstantTokenNone::get(C));
+ Save->eraseFromParent();
+
+ // Split the block before and after coro.suspend and add a jump from the
+ // entry switch:
+ //
+ // whateverBB:
+ // whatever
+ // %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+ // switch i8 %0, label %suspend [i8 0, label %resume
+ // i8 1, label %cleanup]
+ // becomes:
+ //
+ // whateverBB:
+ // whatever
+ // br label %resume.0.landing
+ //
+ // resume.0: ; <--- jump from the switch in the resume.entry
+ // %0 = tail call i8 @llvm.coro.suspend(token none, i1 false)
+ // br label %resume.0.landing
+ //
+ // resume.0.landing:
+ // %1 = phi i8[-1, %whateverBB], [%0, %resume.0]
+ // switch i8 %1, label %suspend [i8 0, label %resume
+ // i8 1, label %cleanup]
+
+ auto *SuspendBB = S->getParent();
+ auto *ResumeBB =
+ SuspendBB->splitBasicBlock(S, "resume." + Twine(SuspendIndex));
+ auto *LandingBB = ResumeBB->splitBasicBlock(
+ S->getNextNode(), ResumeBB->getName() + Twine(".landing"));
+ Switch->addCase(IndexVal, ResumeBB);
+
+ cast<BranchInst>(SuspendBB->getTerminator())->setSuccessor(0, LandingBB);
+ auto *PN = PHINode::Create(Builder.getInt8Ty(), 2, "", &LandingBB->front());
+ S->replaceAllUsesWith(PN);
+ PN->addIncoming(Builder.getInt8(-1), SuspendBB);
+ PN->addIncoming(S, ResumeBB);
+
+ ++SuspendIndex;
+ }
+
+ Builder.SetInsertPoint(UnreachBB);
+ Builder.CreateUnreachable();
+
+ return NewEntry;
+}
+
+// In Resumers, we replace fallthrough coro.end with ret void and delete the
+// rest of the block.
+static void replaceFallthroughCoroEnd(IntrinsicInst *End,
+ ValueToValueMapTy &VMap) {
+ auto *NewE = cast<IntrinsicInst>(VMap[End]);
+ ReturnInst::Create(NewE->getContext(), nullptr, NewE);
+
+ // Remove the rest of the block, by splitting it into an unreachable block.
+ auto *BB = NewE->getParent();
+ BB->splitBasicBlock(NewE);
+ BB->getTerminator()->eraseFromParent();
+}
+
+// In Resumers, we replace unwind coro.end with True to force the immediate
+// unwind to the caller.
+static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) {
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *True = ConstantInt::getTrue(Context);
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ if (!CE->isUnwind())
+ continue;
+
+ auto *NewCE = cast<IntrinsicInst>(VMap[CE]);
+
+ // If coro.end has an associated bundle, add cleanupret instruction.
+ if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) {
+ Value *FromPad = Bundle->Inputs[0];
+ auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE);
+ NewCE->getParent()->splitBasicBlock(NewCE);
+ CleanupRet->getParent()->getTerminator()->eraseFromParent();
+ }
+
+ NewCE->replaceAllUsesWith(True);
+ NewCE->eraseFromParent();
+ }
+}
+
+// Rewrite final suspend point handling. We do not use suspend index to
+// represent the final suspend point. Instead we zero-out ResumeFnAddr in the
+// coroutine frame, since it is undefined behavior to resume a coroutine
+// suspended at the final suspend point. Thus, in the resume function, we can
+// simply remove the last case (when coro::Shape is built, the final suspend
+// point (if present) is always the last element of CoroSuspends array).
+// In the destroy function, we add a code sequence to check if ResumeFnAddr
+// is null, and if so, jump to the appropriate label to handle cleanup from the
+// final suspend point.
+static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr,
+ coro::Shape &Shape, SwitchInst *Switch,
+ bool IsDestroy) {
+ assert(Shape.HasFinalSuspend);
+ auto FinalCaseIt = std::prev(Switch->case_end());
+ BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor();
+ Switch->removeCase(FinalCaseIt);
+ if (IsDestroy) {
+ BasicBlock *OldSwitchBB = Switch->getParent();
+ auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch");
+ Builder.SetInsertPoint(OldSwitchBB->getTerminator());
+ auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, FramePtr,
+ 0, 0, "ResumeFn.addr");
+ auto *Load = Builder.CreateLoad(GepIndex);
+ auto *NullPtr =
+ ConstantPointerNull::get(cast<PointerType>(Load->getType()));
+ auto *Cond = Builder.CreateICmpEQ(Load, NullPtr);
+ Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB);
+ OldSwitchBB->getTerminator()->eraseFromParent();
+ }
+}
+
+// Create a resume clone by cloning the body of the original function, setting
+// a new entry block, and replacing coro.suspend with an appropriate value to
+// force the resume or cleanup path at every suspend point.
+static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
+ BasicBlock *ResumeEntry, int8_t FnIndex) {
+ Module *M = F.getParent();
+ auto *FrameTy = Shape.FrameTy;
+ auto *FnPtrTy = cast<PointerType>(FrameTy->getElementType(0));
+ auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType());
+
+ Function *NewF =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage,
+ F.getName() + Suffix, M);
+ NewF->addParamAttr(0, Attribute::NonNull);
+ NewF->addParamAttr(0, Attribute::NoAlias);
+
+ ValueToValueMapTy VMap;
+ // Replace all args with undefs. The buildCoroutineFrame algorithm has already
+ // rewritten accesses to the args that occur after suspend points with loads
+ // and stores to/from the coroutine frame.
+ for (Argument &A : F.args())
+ VMap[&A] = UndefValue::get(A.getType());
+
+ SmallVector<ReturnInst *, 4> Returns;
+
+ CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ // Remove old returns.
+ for (ReturnInst *Return : Returns)
+ changeToUnreachable(Return, /*UseLLVMTrap=*/false);
+
+ // Remove old return attributes.
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewF->getReturnType()));
+
+ // Make AllocaSpillBlock the new entry block.
+ auto *SwitchBB = cast<BasicBlock>(VMap[ResumeEntry]);
+ auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]);
+ Entry->moveBefore(&NewF->getEntryBlock());
+ Entry->getTerminator()->eraseFromParent();
+ BranchInst::Create(SwitchBB, Entry);
+ Entry->setName("entry" + Suffix);
+
+ // Clear all predecessors of the new entry block.
+ auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]);
+ Entry->replaceAllUsesWith(Switch->getDefaultDest());
+
+ IRBuilder<> Builder(&NewF->getEntryBlock().front());
+
+ // Remap frame pointer.
+ Argument *NewFramePtr = &*NewF->arg_begin();
+ Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]);
+ NewFramePtr->takeName(OldFramePtr);
+ OldFramePtr->replaceAllUsesWith(NewFramePtr);
+
+ // Remap vFrame pointer.
+ auto *NewVFrame = Builder.CreateBitCast(
+ NewFramePtr, Type::getInt8PtrTy(Builder.getContext()), "vFrame");
+ Value *OldVFrame = cast<Value>(VMap[Shape.CoroBegin]);
+ OldVFrame->replaceAllUsesWith(NewVFrame);
+
+ // Rewrite final suspend handling, as it is not done via the switch (this
+ // allows us to remove the final case from the switch, since it is undefined
+ // behavior to resume a coroutine suspended at the final suspend point).
+ if (Shape.HasFinalSuspend) {
+ auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]);
+ bool IsDestroy = FnIndex != 0;
+ handleFinalSuspend(Builder, NewFramePtr, Shape, Switch, IsDestroy);
+ }
+
+ // Replace coro.suspend with the appropriate resume index.
+ // Replacing coro.suspend with (0) will result in control flow proceeding to
+ // a resume label associated with a suspend point, while replacing it with (1)
+ // will result in control flow proceeding to a cleanup label associated with
+ // this suspend point.
+ auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0);
+ for (CoroSuspendInst *CS : Shape.CoroSuspends) {
+ auto *MappedCS = cast<CoroSuspendInst>(VMap[CS]);
+ MappedCS->replaceAllUsesWith(NewValue);
+ MappedCS->eraseFromParent();
+ }
+
+ // Remove coro.end intrinsics.
+ replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap);
+ replaceUnwindCoroEnds(Shape, VMap);
+ // Eliminate coro.free from the clones, replacing it with 'null' in cleanup,
+ // to suppress deallocation code.
+ coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]),
+ /*Elide=*/FnIndex == 2);
+
+ NewF->setCallingConv(CallingConv::Fast);
+
+ return NewF;
+}
+
+static void removeCoroEnds(coro::Shape &Shape) {
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *False = ConstantInt::getFalse(Context);
+
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ CE->replaceAllUsesWith(False);
+ CE->eraseFromParent();
+ }
+}
+
+static void replaceFrameSize(coro::Shape &Shape) {
+ if (Shape.CoroSizes.empty())
+ return;
+
+ // In the same function all coro.sizes should have the same result type.
+ auto *SizeIntrin = Shape.CoroSizes.back();
+ Module *M = SizeIntrin->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ auto Size = DL.getTypeAllocSize(Shape.FrameTy);
+ auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size);
+
+ for (CoroSizeInst *CS : Shape.CoroSizes) {
+ CS->replaceAllUsesWith(SizeConstant);
+ CS->eraseFromParent();
+ }
+}
+
+// Create a global constant array containing pointers to the functions provided
+// and set the Info parameter of CoroBegin to point at this constant. Example:
+//
+// @f.resumers = internal constant [2 x void(%f.frame*)*]
+// [void(%f.frame*)* @f.resume, void(%f.frame*)* @f.destroy]
+// define void @f() {
+// ...
+// call i8* @llvm.coro.begin(i8* null, i32 0, i8* null,
+// i8* bitcast([2 x void(%f.frame*)*] * @f.resumers to i8*))
+//
+// Assumes that all the functions have the same signature.
+static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin,
+ std::initializer_list<Function *> Fns) {
+
+ SmallVector<Constant *, 4> Args(Fns.begin(), Fns.end());
+ assert(!Args.empty());
+ Function *Part = *Fns.begin();
+ Module *M = Part->getParent();
+ auto *ArrTy = ArrayType::get(Part->getType(), Args.size());
+
+ auto *ConstVal = ConstantArray::get(ArrTy, Args);
+ auto *GV = new GlobalVariable(*M, ConstVal->getType(), /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, ConstVal,
+ F.getName() + Twine(".resumers"));
+
+ // Update coro.begin instruction to refer to this constant.
+ LLVMContext &C = F.getContext();
+ auto *BC = ConstantExpr::getPointerCast(GV, Type::getInt8PtrTy(C));
+ CoroBegin->getId()->setInfo(BC);
+}
+
+// Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame.
+static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn,
+ Function *DestroyFn, Function *CleanupFn) {
+
+ IRBuilder<> Builder(Shape.FramePtr->getNextNode());
+ auto *ResumeAddr = Builder.CreateConstInBoundsGEP2_32(
+ Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::ResumeField,
+ "resume.addr");
+ Builder.CreateStore(ResumeFn, ResumeAddr);
+
+ Value *DestroyOrCleanupFn = DestroyFn;
+
+ CoroIdInst *CoroId = Shape.CoroBegin->getId();
+ if (CoroAllocInst *CA = CoroId->getCoroAlloc()) {
+ // If there is a CoroAlloc and it returns false (meaning we elided the
+ // allocation), use CleanupFn instead of DestroyFn.
+ DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn);
+ }
+
+ auto *DestroyAddr = Builder.CreateConstInBoundsGEP2_32(
+ Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::DestroyField,
+ "destroy.addr");
+ Builder.CreateStore(DestroyOrCleanupFn, DestroyAddr);
+}
+
+static void postSplitCleanup(Function &F) {
+ removeUnreachableBlocks(F);
+ llvm::legacy::FunctionPassManager FPM(F.getParent());
+
+ FPM.add(createVerifierPass());
+ FPM.add(createSCCPPass());
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createCFGSimplificationPass());
+
+ FPM.doInitialization();
+ FPM.run(F);
+ FPM.doFinalization();
+}
+
+// The coroutine has no suspend points. Remove the heap allocation for the
+// coroutine frame if possible.
+static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) {
+ auto *CoroId = CoroBegin->getId();
+ auto *AllocInst = CoroId->getCoroAlloc();
+ coro::replaceCoroFree(CoroId, /*Elide=*/AllocInst != nullptr);
+ if (AllocInst) {
+ IRBuilder<> Builder(AllocInst);
+ // FIXME: Need to handle overaligned members.
+ auto *Frame = Builder.CreateAlloca(FrameTy);
+ auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy());
+ AllocInst->replaceAllUsesWith(Builder.getFalse());
+ AllocInst->eraseFromParent();
+ CoroBegin->replaceAllUsesWith(VFrame);
+ } else {
+ CoroBegin->replaceAllUsesWith(CoroBegin->getMem());
+ }
+ CoroBegin->eraseFromParent();
+}
+
+// Look for a very simple pattern:
+// coro.save
+// no other calls
+// resume or destroy call
+// coro.suspend
+//
+// If there are other calls between coro.save and coro.suspend, they can
+// potentially resume or destroy the coroutine, so it is unsafe to eliminate a
+// suspend point.
+static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
+ CoroBeginInst *CoroBegin) {
+ auto *Save = Suspend->getCoroSave();
+ auto *BB = Suspend->getParent();
+ if (BB != Save->getParent())
+ return false;
+
+ CallSite SingleCallSite;
+
+ // Check that we have only one CallSite.
+ for (Instruction *I = Save->getNextNode(); I != Suspend;
+ I = I->getNextNode()) {
+ if (isa<CoroFrameInst>(I))
+ continue;
+ if (isa<CoroSubFnInst>(I))
+ continue;
+ if (CallSite CS = CallSite(I)) {
+ if (SingleCallSite)
+ return false;
+ else
+ SingleCallSite = CS;
+ }
+ }
+ auto *CallInstr = SingleCallSite.getInstruction();
+ if (!CallInstr)
+ return false;
+
+ auto *Callee = SingleCallSite.getCalledValue()->stripPointerCasts();
+
+ // See if the callsite is for resumption or destruction of the coroutine.
+ auto *SubFn = dyn_cast<CoroSubFnInst>(Callee);
+ if (!SubFn)
+ return false;
+
+ // If it does not refer to the current coroutine, we cannot do anything with
+ // it.
+ if (SubFn->getFrame() != CoroBegin)
+ return false;
+
+ // Replace llvm.coro.suspend with the value that results in resumption over
+ // the resume or cleanup path.
+ Suspend->replaceAllUsesWith(SubFn->getRawIndex());
+ Suspend->eraseFromParent();
+ Save->eraseFromParent();
+
+ // No longer need a call to coro.resume or coro.destroy.
+ CallInstr->eraseFromParent();
+
+ if (SubFn->user_empty())
+ SubFn->eraseFromParent();
+
+ return true;
+}
+
+// Remove suspend points that are simplified.
+static void simplifySuspendPoints(coro::Shape &Shape) {
+ auto &S = Shape.CoroSuspends;
+ size_t I = 0, N = S.size();
+ if (N == 0)
+ return;
+ for (;;) {
+ if (simplifySuspendPoint(S[I], Shape.CoroBegin)) {
+ if (--N == I)
+ break;
+ std::swap(S[I], S[N]);
+ continue;
+ }
+ if (++I == N)
+ break;
+ }
+ S.resize(N);
+}
+
+static SmallPtrSet<BasicBlock *, 4> getCoroBeginPredBlocks(CoroBeginInst *CB) {
+ // Collect all blocks that we need to look for instructions to relocate.
+ SmallPtrSet<BasicBlock *, 4> RelocBlocks;
+ SmallVector<BasicBlock *, 4> Work;
+ Work.push_back(CB->getParent());
+
+ do {
+ BasicBlock *Current = Work.pop_back_val();
+ for (BasicBlock *BB : predecessors(Current))
+ if (RelocBlocks.count(BB) == 0) {
+ RelocBlocks.insert(BB);
+ Work.push_back(BB);
+ }
+ } while (!Work.empty());
+ return RelocBlocks;
+}
+
+static SmallPtrSet<Instruction *, 8>
+getNotRelocatableInstructions(CoroBeginInst *CoroBegin,
+ SmallPtrSetImpl<BasicBlock *> &RelocBlocks) {
+ SmallPtrSet<Instruction *, 8> DoNotRelocate;
+ // Collect all instructions that we should not relocate
+ SmallVector<Instruction *, 8> Work;
+
+ // Start with CoroBegin and terminators of all preceding blocks.
+ Work.push_back(CoroBegin);
+ BasicBlock *CoroBeginBB = CoroBegin->getParent();
+ for (BasicBlock *BB : RelocBlocks)
+ if (BB != CoroBeginBB)
+ Work.push_back(BB->getTerminator());
+
+ // For every instruction in the Work list, place its operands in the
+ // DoNotRelocate set.
+ do {
+ Instruction *Current = Work.pop_back_val();
+ DoNotRelocate.insert(Current);
+ for (Value *U : Current->operands()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ continue;
+ if (isa<AllocaInst>(U))
+ continue;
+ if (DoNotRelocate.count(I) == 0) {
+ Work.push_back(I);
+ DoNotRelocate.insert(I);
+ }
+ }
+ } while (!Work.empty());
+ return DoNotRelocate;
+}
+
+static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) {
+ // Analyze which non-alloca instructions are needed for allocation and
+ // relocate the rest to after coro.begin. We need to do this because some of
+ // the targets of those instructions may be placed into the coroutine frame,
+ // memory for which becomes available only after the coro.begin intrinsic.
+
+ auto BlockSet = getCoroBeginPredBlocks(CoroBegin);
+ auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet);
+
+ Instruction *InsertPt = CoroBegin->getNextNode();
+ BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well.
+ for (auto B = BB.begin(), E = BB.end(); B != E;) {
+ Instruction &I = *B++;
+ if (isa<AllocaInst>(&I))
+ continue;
+ if (&I == CoroBegin)
+ break;
+ if (DoNotRelocateSet.count(&I))
+ continue;
+ I.moveBefore(InsertPt);
+ }
+}
+
+static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
+ coro::Shape Shape(F);
+ if (!Shape.CoroBegin)
+ return;
+
+ simplifySuspendPoints(Shape);
+ relocateInstructionBefore(Shape.CoroBegin, F);
+ buildCoroutineFrame(F, Shape);
+ replaceFrameSize(Shape);
+
+ // If there are no suspend points, no split is required; just remove
+ // the allocation and deallocation blocks, they are not needed.
+ if (Shape.CoroSuspends.empty()) {
+ handleNoSuspendCoroutine(Shape.CoroBegin, Shape.FrameTy);
+ removeCoroEnds(Shape);
+ postSplitCleanup(F);
+ coro::updateCallGraph(F, {}, CG, SCC);
+ return;
+ }
+
+ auto *ResumeEntry = createResumeEntryBlock(F, Shape);
+ auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0);
+ auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1);
+ auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2);
+
+ // We no longer need coro.end in F.
+ removeCoroEnds(Shape);
+
+ postSplitCleanup(F);
+ postSplitCleanup(*ResumeClone);
+ postSplitCleanup(*DestroyClone);
+ postSplitCleanup(*CleanupClone);
+
+ // Store the addresses of the resume/destroy/cleanup functions in the
+ // coroutine frame.
+ updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone);
+
+ // Create a constant array referring to the resume/destroy/cleanup functions,
+ // pointed to by the last (info) argument of coro.id, so that the CoroElide
+ // pass can determine the correct function to call.
+ setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone});
+
+ // Update call graph and add the functions we created to the SCC.
+ coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC);
+}
+
+// When we see the coroutine for the first time, we insert an indirect call to
+// the devirt trigger function and mark the coroutine as now ready for split.
+static void prepareForSplit(Function &F, CallGraph &CG) {
+ Module &M = *F.getParent();
+#ifndef NDEBUG
+ Function *DevirtFn = M.getFunction(CORO_DEVIRT_TRIGGER_FN);
+ assert(DevirtFn && "coro.devirt.trigger function not found");
+#endif
+
+ F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
+
+ // Insert an indirect call sequence that will be devirtualized by CoroElide
+ // pass:
+ // %0 = call i8* @llvm.coro.subfn.addr(i8* null, i8 -1)
+ // %1 = bitcast i8* %0 to void(i8*)*
+ // call void %1(i8* null)
+ coro::LowererBase Lowerer(M);
+ Instruction *InsertPt = F.getEntryBlock().getTerminator();
+ auto *Null = ConstantPointerNull::get(Type::getInt8PtrTy(F.getContext()));
+ auto *DevirtFnAddr =
+ Lowerer.makeSubFnCall(Null, CoroSubFnInst::RestartTrigger, InsertPt);
+ auto *IndirectCall = CallInst::Create(DevirtFnAddr, Null, "", InsertPt);
+
+ // Update the call graph with the indirect call we just added.
+ CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode());
+}
+
+// Make sure that there is a devirtualization trigger function that the
+// CoroSplit pass uses to force a restart of the CGSCC pipeline. If the devirt
+// trigger function is not found, we will create one and add it to the current
+// SCC.
+static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
+ Module &M = CG.getModule();
+ if (M.getFunction(CORO_DEVIRT_TRIGGER_FN))
+ return;
+
+ LLVMContext &C = M.getContext();
+ auto *FnTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C),
+ /*IsVarArgs=*/false);
+ Function *DevirtFn =
+ Function::Create(FnTy, GlobalValue::LinkageTypes::PrivateLinkage,
+ CORO_DEVIRT_TRIGGER_FN, &M);
+ DevirtFn->addFnAttr(Attribute::AlwaysInline);
+ auto *Entry = BasicBlock::Create(C, "entry", DevirtFn);
+ ReturnInst::Create(C, Entry);
+
+ auto *Node = CG.getOrInsertFunction(DevirtFn);
+
+ SmallVector<CallGraphNode *, 8> Nodes(SCC.begin(), SCC.end());
+ Nodes.push_back(Node);
+ SCC.initialize(Nodes);
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct CoroSplit : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ CoroSplit() : CallGraphSCCPass(ID) {
+ initializeCoroSplitPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool Run = false;
+
+ // A coroutine is identified by the presence of the coro.begin intrinsic; if
+ // there are none, this pass has nothing to do.
+ bool doInitialization(CallGraph &CG) override {
+ Run = coro::declaresIntrinsics(CG.getModule(), {"llvm.coro.begin"});
+ return CallGraphSCCPass::doInitialization(CG);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override {
+ if (!Run)
+ return false;
+
+ // Find coroutines for processing.
+ SmallVector<Function *, 4> Coroutines;
+ for (CallGraphNode *CGN : SCC)
+ if (auto *F = CGN->getFunction())
+ if (F->hasFnAttribute(CORO_PRESPLIT_ATTR))
+ Coroutines.push_back(F);
+
+ if (Coroutines.empty())
+ return false;
+
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ createDevirtTriggerFunc(CG, SCC);
+
+ for (Function *F : Coroutines) {
+ Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR);
+ StringRef Value = Attr.getValueAsString();
+ DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F->getName()
+ << "' state: " << Value << "\n");
+ if (Value == UNPREPARED_FOR_SPLIT) {
+ prepareForSplit(*F, CG);
+ continue;
+ }
+ F->removeFnAttr(CORO_PRESPLIT_ATTR);
+ splitCoroutine(*F, CG, SCC);
+ }
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+ StringRef getPassName() const override { return "Coroutine Splitting"; }
+};
+}
+
+char CoroSplit::ID = 0;
+INITIALIZE_PASS(
+ CoroSplit, "coro-split",
+ "Split coroutine into a set of functions driving its state machine", false,
+ false)
+
+Pass *llvm::createCoroSplitPass() { return new CoroSplit(); }
diff --git a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
new file mode 100644
index 000000000000..44e1f9b404ed
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -0,0 +1,326 @@
+//===-- Coroutines.cpp ----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file implements the common infrastructure for Coroutine Passes.
+//===----------------------------------------------------------------------===//
+
+#include "CoroInternal.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+void llvm::initializeCoroutines(PassRegistry &Registry) {
+ initializeCoroEarlyPass(Registry);
+ initializeCoroSplitPass(Registry);
+ initializeCoroElidePass(Registry);
+ initializeCoroCleanupPass(Registry);
+}
+
+static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroSplitPass());
+ PM.add(createCoroElidePass());
+
+ PM.add(createBarrierNoopPass());
+ PM.add(createCoroCleanupPass());
+}
+
+static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroEarlyPass());
+}
+
+static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroElidePass());
+}
+
+static void addCoroutineSCCPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroSplitPass());
+}
+
+static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder,
+ legacy::PassManagerBase &PM) {
+ PM.add(createCoroCleanupPass());
+}
+
+void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) {
+ Builder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
+ addCoroutineEarlyPasses);
+ Builder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+ addCoroutineOpt0Passes);
+ Builder.addExtension(PassManagerBuilder::EP_CGSCCOptimizerLate,
+ addCoroutineSCCPasses);
+ Builder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate,
+ addCoroutineScalarOptimizerPasses);
+ Builder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+ addCoroutineOptimizerLastPasses);
+}
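+
+// A minimal usage sketch (assumption: a frontend or tool driving the legacy
+// pass manager; this is not part of this file's interface):
+//   PassManagerBuilder PMB;
+//   addCoroutinePassesToExtensionPoints(PMB);
+//   legacy::PassManager MPM;
+//   PMB.populateModulePassManager(MPM);
+//   MPM.run(M);  // M is the module being compiled.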
+
+// Construct the lowerer base class and initialize its members.
+coro::LowererBase::LowererBase(Module &M)
+ : TheModule(M), Context(M.getContext()),
+ Int8Ptr(Type::getInt8PtrTy(Context)),
+ ResumeFnType(FunctionType::get(Type::getVoidTy(Context), Int8Ptr,
+ /*isVarArg=*/false)),
+ NullPtr(ConstantPointerNull::get(Int8Ptr)) {}
+
+// Creates a sequence of instructions to obtain a resume function address using
+// llvm.coro.subfn.addr. It generates the following sequence:
+//
+// call i8* @llvm.coro.subfn.addr(i8* %Arg, i8 %index)
+// bitcast i8* %2 to void(i8*)*
+
+Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index,
+ Instruction *InsertPt) {
+ auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index);
+ auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr);
+
+ assert(Index >= CoroSubFnInst::IndexFirst &&
+ Index < CoroSubFnInst::IndexLast &&
+ "makeSubFnCall: Index value out of range");
+ auto *Call = CallInst::Create(Fn, {Arg, IndexVal}, "", InsertPt);
+
+ auto *Bitcast =
+ new BitCastInst(Call, ResumeFnType->getPointerTo(), "", InsertPt);
+ return Bitcast;
+}
+
+#ifndef NDEBUG
+static bool isCoroutineIntrinsicName(StringRef Name) {
+ // NOTE: Must be sorted!
+ static const char *const CoroIntrinsics[] = {
+ "llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy",
+ "llvm.coro.done", "llvm.coro.end", "llvm.coro.frame",
+ "llvm.coro.free", "llvm.coro.id", "llvm.coro.param",
+ "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.save",
+ "llvm.coro.size", "llvm.coro.subfn.addr", "llvm.coro.suspend",
+ };
+ return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1;
+}
+#endif
+
+// Verifies whether the module declares any of the named values listed. Also,
+// in debug mode, verifies that the names are coroutine intrinsic names.
+bool coro::declaresIntrinsics(Module &M,
+ std::initializer_list<StringRef> List) {
+
+ for (StringRef Name : List) {
+ assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic");
+ if (M.getNamedValue(Name))
+ return true;
+ }
+
+ return false;
+}
+
+// Replace all coro.frees associated with the provided CoroId with 'null' if
+// Elide is true, and with their frame parameter otherwise.
+void coro::replaceCoroFree(CoroIdInst *CoroId, bool Elide) {
+ SmallVector<CoroFreeInst *, 4> CoroFrees;
+ for (User *U : CoroId->users())
+ if (auto CF = dyn_cast<CoroFreeInst>(U))
+ CoroFrees.push_back(CF);
+
+ if (CoroFrees.empty())
+ return;
+
+ Value *Replacement =
+ Elide ? ConstantPointerNull::get(Type::getInt8PtrTy(CoroId->getContext()))
+ : CoroFrees.front()->getFrame();
+
+ for (CoroFreeInst *CF : CoroFrees) {
+ CF->replaceAllUsesWith(Replacement);
+ CF->eraseFromParent();
+ }
+}
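+
+// For illustration only (an assumption about typical pre-split IR, not taken
+// from this file): a coroutine releases its frame with a pattern like
+//
+//   %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
+//   call void @free(i8* %mem)
+//
+// When the frame is elided onto the caller's stack (Elide == true), rewriting
+// the coro.free result to null lets the guarded call to free() fold away;
+// otherwise it is rewritten to the frame pointer itself.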
+
+// FIXME: This code is stolen from CallGraph::addToCallGraph(Function *F), which
+// happens to be private. It would be better for this functionality to be
+// exposed by CallGraph.
+static void buildCGN(CallGraph &CG, CallGraphNode *Node) {
+ Function *F = Node->getFunction();
+
+ // Look for calls by this function.
+ for (Instruction &I : instructions(F))
+ if (CallSite CS = CallSite(cast<Value>(&I))) {
+ const Function *Callee = CS.getCalledFunction();
+ if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
+ // Indirect calls of intrinsics are not allowed so no need to check.
+ // We can be more precise here by using TargetArg returned by
+ // Intrinsic::isLeaf.
+ Node->addCalledFunction(CS, CG.getCallsExternalNode());
+ else if (!Callee->isIntrinsic())
+ Node->addCalledFunction(CS, CG.getOrInsertFunction(Callee));
+ }
+}
+
+// Rebuild CGN after we extracted parts of the code from ParentFunc into
+// NewFuncs. Builds CGNs for the NewFuncs and adds them to the current SCC.
+void coro::updateCallGraph(Function &ParentFunc, ArrayRef<Function *> NewFuncs,
+ CallGraph &CG, CallGraphSCC &SCC) {
+ // Rebuild CGN from scratch for the ParentFunc
+ auto *ParentNode = CG[&ParentFunc];
+ ParentNode->removeAllCalledFunctions();
+ buildCGN(CG, ParentNode);
+
+ SmallVector<CallGraphNode *, 8> Nodes(SCC.begin(), SCC.end());
+
+ for (Function *F : NewFuncs) {
+ CallGraphNode *Callee = CG.getOrInsertFunction(F);
+ Nodes.push_back(Callee);
+ buildCGN(CG, Callee);
+ }
+
+ SCC.initialize(Nodes);
+}
+
+static void clear(coro::Shape &Shape) {
+ Shape.CoroBegin = nullptr;
+ Shape.CoroEnds.clear();
+ Shape.CoroSizes.clear();
+ Shape.CoroSuspends.clear();
+
+ Shape.FrameTy = nullptr;
+ Shape.FramePtr = nullptr;
+ Shape.AllocaSpillBlock = nullptr;
+ Shape.ResumeSwitch = nullptr;
+ Shape.PromiseAlloca = nullptr;
+ Shape.HasFinalSuspend = false;
+}
+
+static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin,
+ CoroSuspendInst *SuspendInst) {
+ Module *M = SuspendInst->getModule();
+ auto *Fn = Intrinsic::getDeclaration(M, Intrinsic::coro_save);
+ auto *SaveInst =
+ cast<CoroSaveInst>(CallInst::Create(Fn, CoroBegin, "", SuspendInst));
+ assert(!SuspendInst->getCoroSave());
+ SuspendInst->setArgOperand(0, SaveInst);
+ return SaveInst;
+}
+
+// Collect "interesting" coroutine intrinsics.
+void coro::Shape::buildFrom(Function &F) {
+ size_t FinalSuspendIndex = 0;
+ clear(*this);
+ SmallVector<CoroFrameInst *, 8> CoroFrames;
+ SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
+
+ for (Instruction &I : instructions(F)) {
+ if (auto II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::coro_size:
+ CoroSizes.push_back(cast<CoroSizeInst>(II));
+ break;
+ case Intrinsic::coro_frame:
+ CoroFrames.push_back(cast<CoroFrameInst>(II));
+ break;
+ case Intrinsic::coro_save:
+ // After optimizations, coro_suspends using this coro_save might have
+ // been removed; remember orphaned coro_saves to remove them later.
+ if (II->use_empty())
+ UnusedCoroSaves.push_back(cast<CoroSaveInst>(II));
+ break;
+ case Intrinsic::coro_suspend:
+ CoroSuspends.push_back(cast<CoroSuspendInst>(II));
+ if (CoroSuspends.back()->isFinal()) {
+ if (HasFinalSuspend)
+ report_fatal_error(
+ "Only one suspend point can be marked as final");
+ HasFinalSuspend = true;
+ FinalSuspendIndex = CoroSuspends.size() - 1;
+ }
+ break;
+ case Intrinsic::coro_begin: {
+ auto CB = cast<CoroBeginInst>(II);
+ if (CB->getId()->getInfo().isPreSplit()) {
+ if (CoroBegin)
+ report_fatal_error(
+ "coroutine should have exactly one defining @llvm.coro.begin");
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ CB->removeAttribute(AttributeList::FunctionIndex,
+ Attribute::NoDuplicate);
+ CoroBegin = CB;
+ }
+ break;
+ }
+ case Intrinsic::coro_end:
+ CoroEnds.push_back(cast<CoroEndInst>(II));
+ if (CoroEnds.back()->isFallthrough()) {
+ // Make sure that the fallthrough coro.end is the first element in the
+ // CoroEnds vector.
+ if (CoroEnds.size() > 1) {
+ if (CoroEnds.front()->isFallthrough())
+ report_fatal_error(
+ "Only one coro.end can be marked as fallthrough");
+ std::swap(CoroEnds.front(), CoroEnds.back());
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ // If, for some reason, we were not able to find coro.begin, bail out.
+ if (!CoroBegin) {
+ // Replace coro.frame instructions, which are supposed to be lowered to the
+ // result of coro.begin, with undef.
+ auto *Undef = UndefValue::get(Type::getInt8PtrTy(F.getContext()));
+ for (CoroFrameInst *CF : CoroFrames) {
+ CF->replaceAllUsesWith(Undef);
+ CF->eraseFromParent();
+ }
+
+ // Replace all coro.suspend with undef and remove related coro.saves if
+ // present.
+ for (CoroSuspendInst *CS : CoroSuspends) {
+ CS->replaceAllUsesWith(UndefValue::get(CS->getType()));
+ CS->eraseFromParent();
+ if (auto *CoroSave = CS->getCoroSave())
+ CoroSave->eraseFromParent();
+ }
+
+ // Replace all coro.ends with unreachable instruction.
+ for (CoroEndInst *CE : CoroEnds)
+ changeToUnreachable(CE, /*UseLLVMTrap=*/false);
+
+ return;
+ }
+
+ // The coro.frame intrinsic is always lowered to the result of coro.begin.
+ for (CoroFrameInst *CF : CoroFrames) {
+ CF->replaceAllUsesWith(CoroBegin);
+ CF->eraseFromParent();
+ }
+
+ // Canonicalize coro.suspend by inserting a coro.save if needed.
+ for (CoroSuspendInst *CS : CoroSuspends)
+ if (!CS->getCoroSave())
+ createCoroSave(CoroBegin, CS);
+
+ // Move final suspend to be the last element in the CoroSuspends vector.
+ if (HasFinalSuspend &&
+ FinalSuspendIndex != CoroSuspends.size() - 1)
+ std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
+
+ // Remove orphaned coro.saves.
+ for (CoroSaveInst *CoroSave : UnusedCoroSaves)
+ CoroSave->eraseFromParent();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
new file mode 100644
index 000000000000..b7d96007c24a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -0,0 +1,158 @@
+//===- AlwaysInliner.cpp - Code to inline always_inline functions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+PreservedAnalyses AlwaysInlinerPass::run(Module &M, ModuleAnalysisManager &) {
+ InlineFunctionInfo IFI;
+ SmallSetVector<CallSite, 16> Calls;
+ bool Changed = false;
+ SmallVector<Function *, 16> InlinedFunctions;
+ for (Function &F : M)
+ if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F)) {
+ Calls.clear();
+
+ for (User *U : F.users())
+ if (auto CS = CallSite(U))
+ if (CS.getCalledFunction() == &F)
+ Calls.insert(CS);
+
+ for (CallSite CS : Calls)
+ // FIXME: We really shouldn't be able to fail to inline at this point!
+ // We should do something to log or check the inline failures here.
+ Changed |= InlineFunction(CS, IFI);
+
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any iterator
+ // invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
+
+ // Remove any still-live functions from our list; only trivially dead
+ // definitions remain as deletion candidates.
+ erase_if(InlinedFunctions, [&](Function *F) {
+ F->removeDeadConstantUsers();
+ return !F->isDefTriviallyDead();
+ });
+
+ // Delete the non-comdat ones from the module and also from our vector.
+ auto NonComdatBegin = partition(
+ InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
+ for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
+ M.getFunctionList().erase(F);
+ InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
+
+ if (!InlinedFunctions.empty()) {
+ // Now we just have the comdat functions. Filter out the ones whose comdats
+ // are not actually dead.
+ filterDeadComdatFunctions(M, InlinedFunctions);
+ // The remaining functions are actually dead.
+ for (Function *F : InlinedFunctions)
+ M.getFunctionList().erase(F);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
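+
+// Illustrative usage sketch (not part of the original source): with the new
+// pass manager the pass above can be added directly to a module pipeline.
+// M and MAM are assumed to be provided by the caller:
+//
+//   ModulePassManager MPM;
+//   MPM.addPass(AlwaysInlinerPass());
+//   MPM.run(M, MAM);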
+
+namespace {
+
+/// Inliner pass which only handles "always inline" functions.
+///
+/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight
+/// \c LegacyInlinerBase to provide several facilities such as array alloca
+/// merging.
+class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
+
+public:
+ AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AlwaysInlinerLegacyPass(bool InsertLifetime)
+ : LegacyInlinerBase(ID, InsertLifetime) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Main run interface method. We override here to avoid calling skipSCC().
+ bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallSite CS) override;
+
+ using llvm::Pass::doFinalization;
+ bool doFinalization(CallGraph &CG) override {
+ return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
+ }
+};
+}
+
+char AlwaysInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+
+Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
+ return new AlwaysInlinerLegacyPass(InsertLifetime);
+}
+
+/// \brief Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallSite CS) {
+ Function *Callee = CS.getCalledFunction();
+
+ // Only inline direct calls to functions with always-inline attributes
+ // that are viable for inlining. FIXME: We shouldn't even get here for
+ // declarations.
+ if (Callee && !Callee->isDeclaration() &&
+ CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
+ return InlineCost::getAlways();
+
+ return InlineCost::getNever();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
new file mode 100644
index 000000000000..53223ab44316
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -0,0 +1,1060 @@
+//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments. In
+// practice, this means looking for internal functions that have pointer
+// arguments. If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value. This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded. Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but it does not do so currently.
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
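+//
+// For illustration only (a hypothetical example, not part of the original
+// header): given an internal function whose pointer argument is only loaded,
+//
+//   static int readValue(const int *p) { return *p; }
+//
+// the pass rewrites it to the moral equivalent of
+//
+//   static int readValue(int p) { return p; }
+//
+// and rewrites every call site to load the value and pass it by value.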
+
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "argpromotion"
+
+STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
+
+/// A vector used to hold the indices of a single GEP instruction
+typedef std::vector<uint64_t> IndicesVector;
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it's
+/// safe to do so.
+static Function *
+doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type *> Params;
+
+ typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
+
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+ // Arguments that are directly loaded will have a zero element value here, to
+ // handle cases where there are both a direct load and GEP accesses.
+ //
+ std::map<Argument *, ScalarizeTable> ScalarizedElements;
+
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
+
+ // Attribute - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the parameter
+ // attributes are lost
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ AttributeList PAL = F->getAttributes();
+
+ // First, determine the new argument list
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgNo) {
+ if (ByValArgsToTransform.count(&*I)) {
+ // Simple byval argument? Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Params.insert(Params.end(), STy->element_begin(), STy->element_end());
+ ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
+ AttributeSet());
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(&*I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
+ } else if (I->use_empty()) {
+ // Dead argument (these are always marked as promotable).
+ ++NumArgumentsDead;
+
+ // There may be remaining metadata uses of the argument for things like
+ // llvm.dbg.value. Replace them with undef.
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads
+
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ for (User *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
+ IndicesVector Indices;
+ Indices.reserve(UI->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ OrigLoad = L;
+ else
+ // Take any load; we will use it only to update alias analysis.
+ OrigLoad = cast<LoadInst>(UI->user_back());
+ OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
+ }
+
+ // Add a parameter to the function for each element passed in.
+ for (const auto &ArgIndex : ArgIndices) {
+ // not allowed to dereference ->begin() if size() is 0
+ Params.push_back(GetElementPtrInst::getIndexedType(
+ cast<PointerType>(I->getType()->getScalarType())->getElementType(),
+ ArgIndex.second));
+ ArgAttrVec.push_back(AttributeSet());
+ assert(Params.back());
+ }
+
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
+ }
+ }
+
+ Type *RetTy = FTy->getReturnType();
+
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+ // Create the new function body and insert it into the module.
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+ NF->copyAttributesFrom(F);
+
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NF->setSubprogram(F->getSubprogram());
+ F->setSubprogram(nullptr);
+
+ DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrVec));
+ ArgAttrVec.clear();
+
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value *, 16> Args;
+ while (!F->use_empty()) {
+ CallSite CS(F->user_back());
+ assert(CS.getCalledFunction() == F);
+ Instruction *Call = CS.getInstruction();
+ const AttributeList &CallPAL = CS.getAttributes();
+
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++AI, ++ArgNo)
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ Args.push_back(*AI); // Unmodified argument
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ } else if (ByValArgsToTransform.count(&*I)) {
+ // Emit a GEP and load for each element of the struct.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {
+ ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
+ // TODO: Tell AA about the new values?
+ Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call));
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse.
+ std::vector<Value *> Ops;
+ for (const auto &ArgIndex : ArgIndices) {
+ Value *V = *AI;
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
+ Type *ElTy = V->getType();
+ for (auto II : ArgIndex.second) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ Type *IdxTy =
+ (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
+ : Type::getInt64Ty(F->getContext()));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
+ // Keep track of the type we're currently indexing.
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
+ }
+ // And create a GEP to extract those indices.
+ V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
+ V->getName() + ".idx", Call);
+ Ops.clear();
+ }
+ // Since we're replacing a load make sure we take the alignment
+ // of the previous load.
+ LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call);
+ newLoad->setAlignment(OrigLoad->getAlignment());
+ // Transfer the AA info too.
+ AAMDNodes AAInfo;
+ OrigLoad->getAAMetadata(AAInfo);
+ newLoad->setAAMetadata(AAInfo);
+
+ Args.push_back(newLoad);
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ }
+
+ // Push any varargs arguments on the list.
+ for (; AI != CS.arg_end(); ++AI, ++ArgNo) {
+ Args.push_back(*AI);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
+ } else {
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call);
+ NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ NewCS = NewCall;
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(
+ AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
+ CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+ Args.clear();
+ ArgAttrVec.clear();
+
+ // Update the callgraph to know that the callsite has been transformed.
+ if (ReplaceCallSite)
+ (*ReplaceCallSite)(CS, NewCS);
+
+ if (!Call->use_empty()) {
+ Call->replaceAllUsesWith(NewCS.getInstruction());
+ NewCS->takeName(Call);
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments and transferring the names over as well.
+ //
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I) {
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ continue;
+ }
+
+ if (ByValArgsToTransform.count(&*I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = &NF->begin()->front();
+
+ // Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
+ "", InsertPt);
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
+ nullptr};
+
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
+ I2->setName(I->getName() + "." + Twine(i));
+ new StoreInst(&*I2++, Idx, InsertPt);
+ }
+
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(&*I);
+
+ // If the alloca is used in a call, we must clear the tail flag since
+ // the callee now uses an alloca from the caller.
+ for (User *U : TheAlloca->users()) {
+ CallInst *Call = dyn_cast<CallInst>(U);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
+ }
+ continue;
+ }
+
+ if (I->use_empty())
+ continue;
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
+ assert(ArgIndices.begin()->second.empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName() + ".val");
+ LI->replaceAllUsesWith(&*I2);
+ LI->eraseFromParent();
+ DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ It->second != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
+ }
+
+ std::string NewName = I->getName();
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ NewName += "." + utostr(Operands[i]);
+ }
+ NewName += ".val";
+ TheArg->setName(NewName);
+
+ DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
+
+ // All of the uses must be load instructions. Replace them all with
+ // the corresponding new argument (TheArg).
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->user_back());
+ L->replaceAllUsesWith(&*TheArg);
+ L->eraseFromParent();
+ }
+ GEP->eraseFromParent();
+ }
+ }
+
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ std::advance(I2, ArgIndices.size());
+ }
+
+ return NF;
+}
+
+/// allCallersPassInValidPointerForArgument - Return true if we can prove that
+/// all callers pass in a valid pointer for the specified function argument.
+static bool allCallersPassInValidPointerForArgument(Argument *Arg) {
+ Function *Callee = Arg->getParent();
+ const DataLayout &DL = Callee->getParent()->getDataLayout();
+
+ unsigned ArgNo = Arg->getArgNo();
+
+ // Look at all call sites of the function. At this point we know we only have
+ // direct callers.
+ for (User *U : Callee->users()) {
+ CallSite CS(U);
+ assert(CS && "Should only have direct calls!");
+
+ if (!isDereferenceablePointer(CS.getArgument(ArgNo), DL))
+ return false;
+ }
+ return true;
+}
+
+/// Returns true if Prefix is a prefix of Longer. That is, Longer has a size
+/// that is greater than or equal to the size of Prefix, and each element of
+/// Prefix is the same as the corresponding element of Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
+static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
+ if (Prefix.size() > Longer.size())
+ return false;
+ return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
+}
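+
+// Worked example (illustrative values): isPrefix({1, 2}, {1, 2, 3}) and
+// isPrefix({1, 2}, {1, 2}) both return true, while isPrefix({1, 2}, {1, 3, 2})
+// returns false.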
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool prefixIn(const IndicesVector &Indices,
+ std::set<IndicesVector> &Set) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such a
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && isPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of ToMark in Safe, ToMark is implicitly marked safe
+/// already. Furthermore, any indices that ToMark is itself a prefix of are
+/// removed from Safe (since they are now implicitly safe because of ToMark).
+static void markIndicesSafe(const IndicesVector &ToMark,
+ std::set<IndicesVector> &Safe) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Safe.upper_bound(ToMark);
+ // Guard against the case where Safe is empty
+ if (Low != Safe.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to ToMark. This
+ // means it points to a prefix of ToMark (possibly ToMark itself), if
+ // such a prefix exists.
+ if (Low != Safe.end()) {
+ if (isPrefix(*Low, ToMark))
+ // If there is already a prefix of these indices (or exactly these
+ // indices) marked as safe, don't bother adding them.
+ return;
+
+ // Increment Low, so we can use it as an "insert before" hint.
+ ++Low;
+ }
+ // Insert
+ Low = Safe.insert(Low, ToMark);
+ ++Low;
+ // If ToMark is a prefix of longer index lists, remove those.
+ std::set<IndicesVector>::iterator End = Safe.end();
+ while (Low != End && isPrefix(ToMark, *Low)) {
+ std::set<IndicesVector>::iterator Remove = Low;
+ ++Low;
+ Safe.erase(Remove);
+ }
+}
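+
+// Worked example (illustrative values): with Safe = {{0}, {1, 2, 3}}, marking
+// {1, 2} erases {1, 2, 3} (it now has a safe prefix) and inserts {1, 2};
+// afterwards, marking {0, 5} is a no-op because {0} is already a safe prefix
+// of it.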
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to
+/// MaxElements elements of the aggregate (three by default) in order to avoid
+/// exploding the number of arguments passed in.
+static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
+ AAResults &AAR, unsigned MaxElements) {
+ typedef std::set<IndicesVector> GEPIndicesSet;
+
+ // Quick exit for unused arguments
+ if (Arg->use_empty())
+ return true;
+
+ // We can only promote this argument if all of the uses are loads, or are GEP
+ // instructions (with constant indices) that are subsequently loaded.
+ //
+ // Promoting the argument causes it to be loaded in the caller
+ // unconditionally. This is only safe if we can prove that either the load
+ // would have happened in the callee anyway (ie, there is a load in the entry
+ // block) or the pointer passed in at every call site is guaranteed to be
+ // valid.
+ // In the former case, invalid loads can happen, but would have happened
+ // anyway; in the latter case, invalid loads won't happen. This prevents us
+ // from introducing an invalid load that wouldn't have happened in the
+ // original code.
+ //
+ // This set will contain all sets of indices that are loaded in the entry
+ // block, and thus are safe to unconditionally load in the caller.
+ //
+ // This optimization is also safe for InAlloca parameters, because it verifies
+ // that the address isn't captured.
+ GEPIndicesSet SafeToUnconditionallyLoad;
+
+ // This set contains all the sets of indices that we are planning to promote.
+ // This makes it possible to limit the number of arguments added.
+ GEPIndicesSet ToPromote;
+
+ // If the pointer is always valid, any load with first index 0 is valid.
+ if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg))
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+ // First, iterate the entry block and mark loads of (geps of) arguments as
+ // safe.
+ BasicBlock &EntryBlock = Arg->getParent()->front();
+ // Declare this here so we can reuse it
+ IndicesVector Indices;
+ for (Instruction &I : EntryBlock)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ Value *V = LI->getPointerOperand();
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ V = GEP->getPointerOperand();
+ if (V == Arg) {
+ // This load actually loads (part of) Arg? Check the indices then.
+ Indices.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+ Indices.push_back(CI->getSExtValue());
+ else
+ // We found a non-constant GEP index for this argument? Bail out
+ // right away, can't promote this argument at all.
+ return false;
+
+ // Indices checked out, mark them as safe
+ markIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ Indices.clear();
+ }
+ } else if (V == Arg) {
+ // Direct loads are equivalent to a GEP with a single 0 index.
+ markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+ }
+ }
+
+ // Now, iterate all uses of the argument to see if there are any uses that are
+ // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+ SmallVector<LoadInst *, 16> Loads;
+ IndicesVector Operands;
+ for (Use &U : Arg->uses()) {
+ User *UR = U.getUser();
+ Operands.clear();
+ if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ // Direct loads are equivalent to a GEP with a zero index and then a load.
+ Operands.push_back(0);
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
+ if (GEP->use_empty()) {
+ // Dead GEPs cause trouble later. Just remove them if we run into
+ // them.
+ GEP->eraseFromParent();
+ // TODO: This runs the above loop over and over again for dead GEPs.
+ // Couldn't we just increment the UI iterator earlier and erase the
+ // use?
+ return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR,
+ MaxElements);
+ }
+
+ // Ensure that all of the indices are constants.
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
+ ++i)
+ if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+ Operands.push_back(C->getSExtValue());
+ else
+ return false; // Not a constant operand GEP!
+
+ // Ensure that the only users of the GEP are load instructions.
+ for (User *GEPU : GEP->users())
+ if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ } else {
+ // Uses other than a load?
+ return false;
+ }
+ } else {
+ return false; // Not a load or a GEP.
+ }
+
+ // Now, see if it is safe to promote this load / loads of this GEP. Loading
+ // is safe if Operands, or a prefix of Operands, is marked as safe.
+ if (!prefixIn(Operands, SafeToUnconditionallyLoad))
+ return false;
+
+ // See if we are already promoting a load with these indices; if so, there is
+ // nothing to do. Otherwise, make sure that we aren't promoting too many
+ // elements before recording the new set.
+ if (ToPromote.find(Operands) == ToPromote.end()) {
+ if (MaxElements > 0 && ToPromote.size() == MaxElements) {
+ DEBUG(dbgs() << "argpromotion not promoting argument '"
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
+ // We limit aggregate promotion to only promoting up to a fixed number
+ // of elements of the aggregate.
+ return false;
+ }
+ ToPromote.insert(std::move(Operands));
+ }
+ }
+
+ if (Loads.empty())
+ return true; // No users, this is a dead argument.
+
+ // Okay, now we know that the argument is only used by load instructions and
+ // it is safe to unconditionally perform all of them. Use alias analysis to
+ // check to see if the pointer is guaranteed to not be modified from entry of
+ // the function to each of the load instructions.
+
+ // Because there could be several/many load instructions, remember which
+ // blocks we know to be transparent to the load.
+ df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
+
+ for (LoadInst *Load : Loads) {
+ // Check to see if the load is invalidated from the start of the block to
+ // the load itself.
+ BasicBlock *BB = Load->getParent();
+
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
+ return false; // Pointer is invalidated!
+
+ // Now check every path from the entry block to the load for transparency.
+ // To do this, we perform a depth first search on the inverse CFG from the
+ // loading block.
+ for (BasicBlock *P : predecessors(BB)) {
+ for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+ if (AAR.canBasicBlockModify(*TranspBB, Loc))
+ return false;
+ }
+ }
+
+ // If the path from the entry of the function to each load is free of
+ // instructions that potentially invalidate the load, we can make the
+ // transformation!
+ return true;
+}
+
+/// \brief Checks if a type could have padding bytes.
+static bool isDenselyPacked(Type *type, const DataLayout &DL) {
+
+ // There is no size information, so be conservative.
+ if (!type->isSized())
+ return false;
+
+ // If the alloc size is not equal to the storage size, then there are padding
+ // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
+ return false;
+
+ if (!isa<CompositeType>(type))
+ return true;
+
+ // For homogeneous sequential types, check for padding within members.
+ if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
+
+ // Check for padding within and between elements of a struct.
+ StructType *StructTy = cast<StructType>(type);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
+ uint64_t StartPos = 0;
+ for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+ Type *ElTy = StructTy->getElementType(i);
+ if (!isDenselyPacked(ElTy, DL))
+ return false;
+ if (StartPos != Layout->getElementOffsetInBits(i))
+ return false;
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
+ }
+
+ return true;
+}
+
+/// \brief Checks if the padding bytes of an argument could be accessed.
+static bool canPaddingBeAccessed(Argument *arg) {
+
+ assert(arg->hasByValAttr());
+
+ // Track all the pointers to the argument to make sure they are not captured.
+ SmallPtrSet<Value *, 16> PtrValues;
+ PtrValues.insert(arg);
+
+ // Track all of the stores.
+ SmallVector<StoreInst *, 16> Stores;
+
+ // Scan through the uses recursively to make sure the pointer is always used
+ // sanely.
+ SmallVector<Value *, 16> WorkList;
+ WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+ if (PtrValues.insert(V).second)
+ WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+ Stores.push_back(Store);
+ } else if (!isa<LoadInst>(V)) {
+ return true;
+ }
+ }
+
+ // Check to make sure the pointers aren't captured
+ for (StoreInst *Store : Stores)
+ if (PtrValues.count(Store->getValueOperand()))
+ return true;
+
+ return false;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct). If safe to promote some arguments, it
+/// calls the DoPromotion method.
+///
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
+ // Make sure that it is local to this module.
+ if (!F->hasLocalLinkage())
+ return nullptr;
+
+ // Don't promote arguments for variadic functions. Adding, removing, or
+ // changing non-pack parameters can change the classification of pack
+ // parameters. Frontends encode that classification at the call site in the
+ // IR, while in the callee the classification is determined dynamically based
+ // on the number of registers consumed so far.
+ if (F->isVarArg())
+ return nullptr;
+
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<Argument *, 16> PointerArgs;
+ for (Argument &I : F->args())
+ if (I.getType()->isPointerTy())
+ PointerArgs.push_back(&I);
+ if (PointerArgs.empty())
+ return nullptr;
+
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers. Also see if the function
+ // is self-recursive.
+ bool isSelfRecursive = false;
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ // Must be a direct call.
+ if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
+ return nullptr;
+
+ if (CS.getInstruction()->getParent()->getParent() == F)
+ isSelfRecursive = true;
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ AAResults &AAR = AARGetter(*F);
+
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument *, 8> ArgsToPromote;
+ SmallPtrSet<Argument *, 8> ByValArgsToTransform;
+ for (Argument *PtrArg : PointerArgs) {
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+
+ // Replace sret attribute with noalias. This reduces register pressure by
+ // avoiding a register copy.
+ if (PtrArg->hasStructRetAttr()) {
+ unsigned ArgNo = PtrArg->getArgNo();
+ F->removeParamAttr(ArgNo, Attribute::StructRet);
+ F->addParamAttr(ArgNo, Attribute::NoAlias);
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ CS.removeParamAttr(ArgNo, Attribute::StructRet);
+ CS.addParamAttr(ArgNo, Attribute::NoAlias);
+ }
+ }
+
+ // If this is a byval argument and the aggregate type is small, just pass
+ // the elements directly; that is always safe if the passed value is densely
+ // packed or if we can prove the padding bytes are never accessed. This does
+ // not apply to inalloca.
+ bool isSafeToPromote =
+ PtrArg->hasByValAttr() &&
+ (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
+ if (isSafeToPromote) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
+ DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
+ continue;
+ }
+
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (const auto *EltTy : STy->elements()) {
+ if (!EltTy->isSingleValueType()) {
+ AllSimple = false;
+ break;
+ }
+ }
+
+ // Safe to transform; don't even bother trying to "promote" it.
+ // Passing the elements as scalars will allow SROA to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
+ }
+ }
+ }
+
+ // If the argument is a recursive type and we're in a recursive
+ // function, we could end up infinitely peeling the function argument.
+ if (isSelfRecursive) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ bool RecursiveType = false;
+ for (const auto *EltTy : STy->elements()) {
+ if (EltTy == PtrArg->getType()) {
+ RecursiveType = true;
+ break;
+ }
+ }
+ if (RecursiveType)
+ continue;
+ }
+ }
+
+ // Otherwise, see if we can promote the pointer to its value.
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
+ MaxElements))
+ ArgsToPromote.insert(PtrArg);
+ }
+
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+ return nullptr;
+
+ return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
+
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+
+ for (LazyCallGraph::Node &N : C) {
+ Function &OldF = N.getFunction();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // FIXME: This lambda must only be used with this function. We should
+ // skip the lambda and just get the AA results directly.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ assert(&F == &OldF && "Called with an unexpected function!");
+ return FAM.getResult<AAManager>(F);
+ };
+
+ Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+ if (!NewF)
+ continue;
+ LocalChange = true;
+
+ // Directly substitute the functions in the call graph. Note that this
+ // requires the old function to be completely dead and completely
+ // replaced by the new function. It does no call graph updates, it merely
+ // swaps out the particular function mapped to a particular node in the
+ // graph.
+ C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+ OldF.eraseFromParent();
+ }
+
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
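+
+// Illustrative usage sketch (not part of the original source): under the new
+// pass manager this pass runs on SCCs, so it is typically scheduled through a
+// CGSCC pipeline adaptor. M and MAM are assumed to be provided by the caller:
+//
+//   CGSCCPassManager CGPM;
+//   CGPM.addPass(ArgumentPromotionPass());
+//   ModulePassManager MPM;
+//   MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+//   MPM.run(M, MAM);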
+
+namespace {
+/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+///
+struct ArgPromotion : public CallGraphSCCPass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ static char ID; // Pass identification, replacement for typeid
+ explicit ArgPromotion(unsigned MaxElements = 3)
+ : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+ initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ using llvm::Pass::doInitialization;
+ bool doInitialization(CallGraph &CG) override;
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned MaxElements;
+};
+}
+
+char ArgPromotion::ID = 0;
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false, false)
+
+Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
+ return new ArgPromotion(MaxElements);
+}
+
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ LegacyAARGetter AARGetter(*this);
+
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (CallGraphNode *OldNode : SCC) {
+ Function *OldF = OldNode->getFunction();
+ if (!OldF)
+ continue;
+
+ auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
+ Function *Caller = OldCS.getInstruction()->getParent()->getParent();
+ CallGraphNode *NewCalleeNode =
+ CG.getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = CG[Caller];
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ };
+
+ if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+ {ReplaceCallSite})) {
+ LocalChange = true;
+
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+ NewNode->stealCalledFunctionsFrom(OldNode);
+ if (OldNode->getNumReferences() == 0)
+ delete CG.removeFunctionFromModule(OldNode);
+ else
+ OldF->setLinkage(Function::ExternalLinkage);
+
+ // And update the SCC we're iterating as well.
+ SCC.ReplaceNode(OldNode, NewNode);
+ }
+ }
+ // Remember that we changed something.
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool ArgPromotion::doInitialization(CallGraph &CG) {
+ return CallGraphSCCPass::doInitialization(CG);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
new file mode 100644
index 000000000000..6af104362594
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -0,0 +1,47 @@
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the pass
+// manager's implicit nesting. For example, it can be used to cause a CGSCC
+// pass manager to be closed prior to running a new collection of function
+// passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// \brief A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+ static char ID; // Pass identification.
+
+ BarrierNoop() : ModulePass(ID) {
+ initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+ false, false)
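+
+// Illustrative sketch (an assumption about intended use, not taken from this
+// file): the barrier is placed between pass groups to force the implicit
+// CGSCC pass manager to be closed before later passes run, roughly:
+//
+//   legacy::PassManager PM;
+//   PM.add(createPostOrderFunctionAttrsLegacyPass()); // a CGSCC pass
+//   PM.add(createBarrierNoopPass());   // close the implicit CGSCC manager
+//   PM.add(createGlobalOptimizerPass()); // subsequent module-level passes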
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
new file mode 100644
index 000000000000..62b5a9c9ba26
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -0,0 +1,250 @@
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared. This is useful
+// because some passes (e.g., TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge is designed to build up a map of available constants
+// and eliminate duplicates when it is initialized.
+//
+//===----------------------------------------------------------------------===//
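+//
+// For illustration only (hypothetical IR, not part of the original header):
+// two local constants with identical initializers, such as
+//
+//   @.str.1 = private unnamed_addr constant [4 x i8] c"abc\00"
+//   @.str.2 = private unnamed_addr constant [4 x i8] c"abc\00"
+//
+// are merged so that every use refers to a single surviving global.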
+
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "constmerge"
+
+STATISTIC(NumMerged, "Number of global constants merged");
+
+/// Find values that are marked as llvm.used.
+static void FindUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
+ Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases();
+ GlobalValue *GV = cast<GlobalValue>(Operand);
+ UsedValues.insert(GV);
+ }
+}
+
+// True if A is better than B.
+static bool IsBetterCanonical(const GlobalVariable &A,
+ const GlobalVariable &B) {
+ if (!A.hasLocalLinkage() && B.hasLocalLinkage())
+ return true;
+
+ if (A.hasLocalLinkage() && !B.hasLocalLinkage())
+ return false;
+
+ return A.hasGlobalUnnamedAddr();
+}
+
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ for (const auto &V : MDs)
+ if (V.first != LLVMContext::MD_dbg)
+ return true;
+ return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+ GlobalVariable *To) {
+ SmallVector<DIGlobalVariableExpression *, 1> MDs;
+ From->getDebugInfo(MDs);
+ for (auto MD : MDs)
+ To->addDebugInfo(MD);
+}
+
+static unsigned getAlignment(GlobalVariable *GV) {
+ unsigned Align = GV->getAlignment();
+ if (Align)
+ return Align;
+ return GV->getParent()->getDataLayout().getPreferredAlignment(GV);
+}
+
+static bool mergeConstants(Module &M) {
+ // Find all the globals that are marked "used". These cannot be merged.
+ SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
+ FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
+ FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
+
+ // Map unique constants to globals.
+ DenseMap<Constant *, GlobalVariable *> CMap;
+
+ // Replacements - This vector contains a list of replacements to perform.
+ SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
+
+ bool MadeChange = false;
+
+ // Iterate constant merging while we are still making progress. Merging two
+ // constants together may allow us to merge other constants together if the
+ // second level constants have initializers which point to the globals that
+ // were just merged.
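+  // For example (illustrative only): if @x and @y are identical i32 constants
+  // that get merged, then
+  //   @px = internal unnamed_addr constant i32* @x
+  //   @py = internal unnamed_addr constant i32* @y
+  // end up with identical initializers once @y's uses are rewritten to @x, so
+  // @px and @py can be merged on the next iteration.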
+ while (1) {
+
+ // First: Find the canonical constants others will be merged with.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ // If this GV is dead, remove it.
+ GV->removeDeadConstantUsers();
+ if (GV->use_empty() && GV->hasLocalLinkage()) {
+ GV->eraseFromParent();
+ continue;
+ }
+
+ // Only process constants with initializers in the default address space.
+ if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+ GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+ // Don't touch values marked with attribute(used).
+ UsedGlobals.count(GV))
+ continue;
+
+ // This transformation is legal for weak ODR globals in the sense it
+ // doesn't change semantics, but we really don't want to perform it
+ // anyway; it's likely to pessimize code generation, and some tools
+ // (like the Darwin linker in cases involving CFString) don't expect it.
+ if (GV->isWeakForLinker())
+ continue;
+
+      // Don't touch globals with metadata other than !dbg.
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ GlobalVariable *&Slot = CMap[Init];
+
+      // If this is the first constant with this initializer, or if the old
+      // one is local, replace it with the current one. If the current one is
+      // externally visible it cannot be replaced, but it can still serve as
+      // the canonical constant we merge others into.
+ if (!Slot || IsBetterCanonical(*GV, *Slot))
+ Slot = GV;
+ }
+
+ // Second: identify all globals that can be merged together, filling in
+ // the Replacements vector. We cannot do the replacement in this pass
+ // because doing so may cause initializers of other globals to be rewritten,
+ // invalidating the Constant* pointers in CMap.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ // Only process constants with initializers in the default address space.
+ if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+ GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+ // Don't touch values marked with attribute(used).
+ UsedGlobals.count(GV))
+ continue;
+
+      // We can only replace constants that have local linkage.
+ if (!GV->hasLocalLinkage())
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ GlobalVariable *Slot = CMap[Init];
+
+ if (!Slot || Slot == GV)
+ continue;
+
+ if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr())
+ continue;
+
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
+ if (!GV->hasGlobalUnnamedAddr())
+ Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+ // Make all uses of the duplicate constant use the canonical version.
+ Replacements.push_back(std::make_pair(GV, Slot));
+ }
+
+ if (Replacements.empty())
+ return MadeChange;
+ CMap.clear();
+
+    // Now that we have figured out which replacements must be made, do them
+    // all now. This avoids invalidating the Constant* pointers in CMap, which
+    // are no longer needed at this point.
+ for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+ // Bump the alignment if necessary.
+ if (Replacements[i].first->getAlignment() ||
+ Replacements[i].second->getAlignment()) {
+ Replacements[i].second->setAlignment(
+ std::max(getAlignment(Replacements[i].first),
+ getAlignment(Replacements[i].second)));
+ }
+
+ copyDebugLocMetadata(Replacements[i].first, Replacements[i].second);
+
+ // Eliminate any uses of the dead global.
+ Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
+
+ // Delete the global value from the module.
+ assert(Replacements[i].first->hasLocalLinkage() &&
+ "Refusing to delete an externally visible global variable.");
+ Replacements[i].first->eraseFromParent();
+ }
+
+ NumMerged += Replacements.size();
+ Replacements.clear();
+ }
+}
+
+PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
+ if (!mergeConstants(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct ConstantMergeLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantMergeLegacyPass() : ModulePass(ID) {
+ initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // For this pass, process all of the globals in the module, eliminating
+ // duplicate constants.
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ return mergeConstants(M);
+ }
+};
+}
+
+char ConstantMergeLegacyPass::ID = 0;
+INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
+ "Merge Duplicate Global Constants", false, false)
+
+ModulePass *llvm::createConstantMergePass() {
+ return new ConstantMergeLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
new file mode 100644
index 000000000000..d94aa5da8560
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -0,0 +1,178 @@
+//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass exports all CFI type identifiers found in the module in the form
+// of a __cfi_check function, which can be used to verify cross-DSO call
+// targets.
+//
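+// For illustration (a rough sketch added for exposition, not emitted verbatim
+// by this file), the generated function has approximately this shape:
+//
+//   define void @__cfi_check(i64 %CallSiteTypeId, i8* %Addr,
+//                            i8* %CFICheckFailData) {
+//   entry:
+//     switch i64 %CallSiteTypeId, label %fail [ i64 1234, label %test ]
+//   test:
+//     %ok = call i1 @llvm.type.test(i8* %Addr, metadata i64 1234)
+//     br i1 %ok, label %exit, label %fail
+//   fail:
+//     call void @__cfi_check_fail(i8* %CFICheckFailData, i8* %Addr)
+//     br label %exit
+//   exit:
+//     ret void
+//   }
+//
+// with one "test" block per type identifier found in the module (1234 stands
+// in for an arbitrary type id).
+//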
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cross-dso-cfi"
+
+STATISTIC(NumTypeIds, "Number of unique type identifiers");
+
+namespace {
+
+struct CrossDSOCFI : public ModulePass {
+ static char ID;
+ CrossDSOCFI() : ModulePass(ID) {
+ initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
+ }
+
+ MDNode *VeryLikelyWeights;
+
+ ConstantInt *extractNumericTypeId(MDNode *MD);
+ void buildCFICheck(Module &M);
+ bool runOnModule(Module &M) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
+ false)
+INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
+char CrossDSOCFI::ID = 0;
+
+ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
+
+/// Extracts a numeric type identifier from an MDNode containing type metadata.
+ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
+ // This check excludes vtables for classes inside anonymous namespaces.
+ auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
+ if (!TM)
+ return nullptr;
+ auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
+ if (!C) return nullptr;
+ // We are looking for i64 constants.
+ if (C->getBitWidth() != 64) return nullptr;
+
+ return C;
+}
+
+/// buildCFICheck - emits __cfi_check for the current module.
+void CrossDSOCFI::buildCFICheck(Module &M) {
+ // FIXME: verify that __cfi_check ends up near the end of the code section,
+ // but before the jump slots created in LowerTypeTests.
+ llvm::DenseSet<uint64_t> TypeIds;
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalObject &GO : M.global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ // Sanity check. GO must not be a function declaration.
+ assert(!isa<Function>(&GO) || !cast<Function>(&GO)->isDeclaration());
+
+ if (ConstantInt *TypeId = extractNumericTypeId(Type))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto Func : CfiFunctionsMD->operands()) {
+ assert(Func->getNumOperands() >= 2);
+ for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+ if (ConstantInt *TypeId =
+ extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
+ LLVMContext &Ctx = M.getContext();
+ Constant *C = M.getOrInsertFunction(
+ "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
+ Function *F = dyn_cast<Function>(C);
+ // Take over the existing function. The frontend emits a weak stub so that the
+ // linker knows about the symbol; this pass replaces the function body.
+ F->deleteBody();
+ F->setAlignment(4096);
+ auto args = F->arg_begin();
+ Value &CallSiteTypeId = *(args++);
+ CallSiteTypeId.setName("CallSiteTypeId");
+ Value &Addr = *(args++);
+ Addr.setName("Addr");
+ Value &CFICheckFailData = *(args++);
+ CFICheckFailData.setName("CFICheckFailData");
+ assert(args == F->arg_end());
+
+ BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
+
+ BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
+ IRBuilder<> IRBFail(TrapBB);
+ Constant *CFICheckFailFn = M.getOrInsertFunction(
+ "__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx),
+ Type::getInt8PtrTy(Ctx));
+ IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
+ IRBFail.CreateBr(ExitBB);
+
+ IRBuilder<> IRBExit(ExitBB);
+ IRBExit.CreateRetVoid();
+
+ IRBuilder<> IRB(BB);
+ SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
+ for (uint64_t TypeId : TypeIds) {
+ ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
+ BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
+ IRBuilder<> IRBTest(TestBB);
+ Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ Value *Test = IRBTest.CreateCall(
+ BitsetTestFn, {&Addr, MetadataAsValue::get(
+ Ctx, ConstantAsMetadata::get(CaseTypeId))});
+ BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
+ BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
+
+ SI->addCase(CaseTypeId, TestBB);
+ ++NumTypeIds;
+ }
+}
+
+bool CrossDSOCFI::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ VeryLikelyWeights =
+ MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
+ if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
+ return false;
+ buildCFICheck(M);
+ return true;
+}
+
+PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
+ CrossDSOCFI Impl;
+ bool Changed = Impl.runOnModule(M);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
new file mode 100644
index 000000000000..8e26849ea9e3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -0,0 +1,1051 @@
+//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions. Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions. This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
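+// For illustration (an example added for exposition, not present upstream):
+//
+//   define internal i32 @f(i32 %x, i32 %unused) {
+//     ret i32 %x
+//   }
+//
+// becomes
+//
+//   define internal i32 @f(i32 %x) {
+//     ret i32 %x
+//   }
+//
+// and every direct call to @f drops its second operand.
+//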
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <set>
+#include <tuple>
+using namespace llvm;
+
+#define DEBUG_TYPE "deadargelim"
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated, "Number of unused return values removed");
+STATISTIC(NumArgumentsReplacedWithUndef,
+ "Number of unread args replaced with undef");
+namespace {
+ /// DAE - The dead argument elimination pass.
+ ///
+ class DAE : public ModulePass {
+ protected:
+ // DAH uses this to specify a different ID.
+ explicit DAE(char &ID) : ModulePass(ID) {}
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ DAE() : ModulePass(ID) {
+ initializeDAEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ DeadArgumentEliminationPass DAEP(ShouldHackArguments());
+ ModuleAnalysisManager DummyMAM;
+ PreservedAnalyses PA = DAEP.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ virtual bool ShouldHackArguments() const { return false; }
+ };
+}
+
+
+char DAE::ID = 0;
+INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
+
+namespace {
+ /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+ /// deletes arguments to functions which are external. This is only for use
+ /// by bugpoint.
+ struct DAH : public DAE {
+ static char ID;
+ DAH() : DAE(ID) {}
+
+ bool ShouldHackArguments() const override { return true; }
+ };
+}
+
+char DAH::ID = 0;
+INITIALIZE_PASS(DAH, "deadarghaX0r",
+ "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
+ false, false)
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+///
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
+
+/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
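+///
+/// For example (illustrative only), "define internal void @log(i8*, ...)"
+/// whose body never calls llvm.va_start is rewritten to
+/// "define internal void @log(i8*)", and the trailing operands are dropped
+/// from every direct call site.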
+bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
+ assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+ if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+ // Ensure that the function is only directly called.
+ if (Fn.hasAddressTaken())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked)) {
+ return false;
+ }
+
+ // Okay, we know we can transform this function if safe. Scan its body
+ // looking for calls marked musttail or calls to llvm.vastart.
+ for (BasicBlock &BB : Fn) {
+ for (Instruction &I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+ if (CI->isMustTailCall())
+ return false;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+ }
+ }
+
+ // If we get here, there are no calls to llvm.vastart in the function body,
+ // remove the "..." and adjust all the calls.
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but doesn't have isVarArg set.
+ FunctionType *FTy = Fn.getFunctionType();
+
+ std::vector<Type*> Params(FTy->param_begin(), FTy->param_end());
+ FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
+ Params, false);
+ unsigned NumArgs = Params.size();
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, Fn.getLinkage());
+ NF->copyAttributesFrom(&Fn);
+ NF->setComdat(Fn.getComdat());
+ Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
+ NF->takeName(&Fn);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value*> Args;
+ for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
+ CallSite CS(*I++);
+ if (!CS)
+ continue;
+ Instruction *Call = CS.getInstruction();
+
+ // Pass all the same arguments.
+ Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
+
+ // Drop any attributes that were on the vararg arguments.
+ AttributeList PAL = CS.getAttributes();
+ if (!PAL.isEmpty()) {
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
+ ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
+ PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrs);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
+ } else {
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(PAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+
+ Args.clear();
+
+ if (!Call->use_empty())
+ Call->replaceAllUsesWith(NewCS.getInstruction());
+
+ NewCS->takeName(Call);
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments, also transferring over the names as well. While we're at
+ // it, remove the dead arguments from the DeadArguments list.
+ //
+ for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+ // Move the name and users over to the new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ }
+
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NF->setSubprogram(Fn.getSubprogram());
+
+ // Fix up any BlockAddresses that refer to the function.
+ Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
+ // Delete the bitcast that we just created, so that NF does not
+ // appear to be address-taken.
+ NF->removeDeadConstantUsers();
+ // Finally, nuke the old function.
+ Fn.eraseFromParent();
+ return true;
+}
+
+/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
+/// arguments that are unused, and changes the caller parameters to be undefined
+/// instead.
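+///
+/// For example (illustrative only), if the second parameter of @f is never
+/// read, a call such as "call void @f(i32 %a, i32 %b)" is rewritten to
+/// "call void @f(i32 %a, i32 undef)".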
+bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
+ // We cannot change the arguments if this TU does not define the function or
+ // if the linker may choose a function body from another TU, even if the
+ // nominal linkage indicates that other copies of the function have the same
+ // semantics. In the below example, the dead load from %p may not have been
+ // eliminated from the linker-chosen copy of f, so replacing %p with undef
+ // in callers may introduce undefined behavior.
+ //
+ // define linkonce_odr void @f(i32* %p) {
+ // %v = load i32 %p
+ // ret void
+ // }
+ if (!Fn.hasExactDefinition())
+ return false;
+
+ // Functions with local linkage should already have been handled, except the
+ // fragile (variadic) ones which we can improve here.
+ if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (Fn.use_empty())
+ return false;
+
+ SmallVector<unsigned, 8> UnusedArgs;
+ for (Argument &Arg : Fn.args()) {
+ if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasByValOrInAllocaAttr())
+ UnusedArgs.push_back(Arg.getArgNo());
+ }
+
+ if (UnusedArgs.empty())
+ return false;
+
+ bool Changed = false;
+
+ for (Use &U : Fn.uses()) {
+ CallSite CS(U.getUser());
+ if (!CS || !CS.isCallee(&U))
+ continue;
+
+ // Now go through all unused args and replace them with "undef".
+ for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
+ unsigned ArgNo = UnusedArgs[I];
+
+ Value *Arg = CS.getArgument(ArgNo);
+ CS.setArgument(ArgNo, UndefValue::get(Arg->getType()));
+ ++NumArgumentsReplacedWithUndef;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct. It returns
+/// the number of struct elements for functions returning a struct.
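+///
+/// For example (illustrative only): void -> 0, i32 -> 1, {i32, i8} -> 2,
+/// [4 x float] -> 4.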
+static unsigned NumRetVals(const Function *F) {
+ Type *RetTy = F->getReturnType();
+ if (RetTy->isVoidTy())
+ return 0;
+ else if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getNumElements();
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getNumElements();
+ else
+ return 1;
+}
+
+/// Returns the sub-type a function will return at a given Idx. Should
+/// correspond to the result type of an ExtractValue instruction executed with
+/// just that one Idx (i.e. only top-level structure is considered).
+static Type *getRetComponentType(const Function *F, unsigned Idx) {
+ Type *RetTy = F->getReturnType();
+ assert(!RetTy->isVoidTy() && "void type has no subtype");
+
+ if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getElementType(Idx);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getElementType();
+ else
+ return RetTy;
+}
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
+ UseVector &MaybeLiveUses) {
+ // We're live if our use or its Function is already marked as live.
+ if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
+ return Live;
+
+ // We're maybe live otherwise, but remember that we must become live if
+ // Use becomes live.
+ MaybeLiveUses.push_back(Use);
+ return MaybeLive;
+}
+
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeLive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion; callers outside the
+/// recursion should leave it at its default value.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
+ unsigned RetValNum) {
+ const User *V = U->getUser();
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+ // The value is returned from a function. It's only live when the
+ // function's return value is live. We use RetValNum here, for the case
+ // that U is really a use of an insertvalue instruction that uses the
+ // original Use.
+ const Function *F = RI->getParent()->getParent();
+ if (RetValNum != -1U) {
+ RetOrArg Use = CreateRet(F, RetValNum);
+ // We might be live, depending on the liveness of Use.
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ } else {
+ DeadArgumentEliminationPass::Liveness Result = MaybeLive;
+ for (unsigned i = 0; i < NumRetVals(F); ++i) {
+ RetOrArg Use = CreateRet(F, i);
+ // We might be live, depending on the liveness of Use. If any
+ // sub-value is live, then the entire value is considered live. This
+ // is a conservative choice, and better tracking is possible.
+ DeadArgumentEliminationPass::Liveness SubResult =
+ MarkIfNotLive(Use, MaybeLiveUses);
+ if (Result != Live)
+ Result = SubResult;
+ }
+ return Result;
+ }
+ }
+ if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+ if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+ && IV->hasIndices())
+ // The use we are examining is inserted into an aggregate. Our liveness
+ // depends on all uses of that aggregate, but if it is used as a return
+      // value, only the index at which we were inserted counts.
+ RetValNum = *IV->idx_begin();
+
+ // Note that if we are used as the aggregate operand to the insertvalue,
+ // we don't change RetValNum, but do survey all our uses.
+
+ Liveness Result = MaybeLive;
+ for (const Use &UU : IV->uses()) {
+ Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+ }
+
+ if (auto CS = ImmutableCallSite(V)) {
+ const Function *F = CS.getCalledFunction();
+ if (F) {
+ // Used in a direct call.
+
+ // The function argument is live if it is used as a bundle operand.
+ if (CS.isBundleOperand(U))
+ return Live;
+
+ // Find the argument number. We know for sure that this use is an
+      // argument, since if it were the called function operand this would be
+      // an indirect call, and we know we can't be looking at a value of the
+      // label type (for the invoke instruction).
+ unsigned ArgNo = CS.getArgumentNo(U);
+
+ if (ArgNo >= F->getFunctionType()->getNumParams())
+ // The value is passed in through a vararg! Must be live.
+ return Live;
+
+ assert(CS.getArgument(ArgNo)
+ == CS->getOperand(U->getOperandNo())
+ && "Argument is not where we expected it");
+
+ // Value passed to a normal call. It's only live when the corresponding
+ // argument to the called function turns out live.
+ RetOrArg Use = CreateArg(F, ArgNo);
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ }
+ }
+ // Used in any other way? Value must be live.
+ return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value and returns the
+/// Liveness deduced from them.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUses(const Value *V,
+ UseVector &MaybeLiveUses) {
+  // Assume it's dead (which will only hold if there are no uses at all).
+ Liveness Result = MaybeLive;
+ // Check each use.
+ for (const Use &U : V->uses()) {
+ Result = SurveyUse(&U, MaybeLiveUses);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking whether it uses any of its incoming arguments or whether
+// any callers use the return value. This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+//
+void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
+ // Functions with inalloca parameters are expecting args in a particular
+ // register and memory layout.
+ if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) {
+ MarkLive(F);
+ return;
+ }
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (F.hasFnAttribute(Attribute::Naked)) {
+ MarkLive(F);
+ return;
+ }
+
+ unsigned RetCount = NumRetVals(&F);
+ // Assume all return values are dead
+ typedef SmallVector<Liveness, 5> RetVals;
+ RetVals RetValLiveness(RetCount, MaybeLive);
+
+ typedef SmallVector<UseVector, 5> RetUses;
+ // These vectors map each return value to the uses that make it MaybeLive, so
+ // we can add those to the Uses map if the return value really turns out to be
+ // MaybeLive. Initialized to a list of RetCount empty lists.
+ RetUses MaybeLiveRetUses(RetCount);
+
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+ if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+ != F.getFunctionType()->getReturnType()) {
+ // We don't support old style multiple return values.
+ MarkLive(F);
+ return;
+ }
+
+ if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
+ MarkLive(F);
+ return;
+ }
+
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
+ // Keep track of the number of live retvals, so we can skip checks once all
+ // of them turn out to be live.
+ unsigned NumLiveRetVals = 0;
+ // Loop all uses of the function.
+ for (const Use &U : F.uses()) {
+ // If the function is PASSED IN as an argument, its address has been
+ // taken.
+ ImmutableCallSite CS(U.getUser());
+ if (!CS || !CS.isCallee(&U)) {
+ MarkLive(F);
+ return;
+ }
+
+ // If this use is anything other than a call site, the function is alive.
+ const Instruction *TheCall = CS.getInstruction();
+ if (!TheCall) { // Not a direct call site?
+ MarkLive(F);
+ return;
+ }
+
+ // If we end up here, we are looking at a direct call to our function.
+
+ // Now, check how our return value(s) is/are used in this caller. Don't
+ // bother checking return values if all of them are live already.
+ if (NumLiveRetVals == RetCount)
+ continue;
+
+ // Check all uses of the return value.
+ for (const Use &U : TheCall->uses()) {
+ if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
+ // This use uses a part of our return value, survey the uses of
+ // that part and store the results for this index only.
+ unsigned Idx = *Ext->idx_begin();
+ if (RetValLiveness[Idx] != Live) {
+ RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+ if (RetValLiveness[Idx] == Live)
+ NumLiveRetVals++;
+ }
+ } else {
+        // Used by something other than extractvalue. Survey, but assume that the
+ // result applies to all sub-values.
+ UseVector MaybeLiveAggregateUses;
+ if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
+ NumLiveRetVals = RetCount;
+ RetValLiveness.assign(RetCount, Live);
+ break;
+ } else {
+ for (unsigned i = 0; i != RetCount; ++i) {
+ if (RetValLiveness[i] != Live)
+ MaybeLiveRetUses[i].append(MaybeLiveAggregateUses.begin(),
+ MaybeLiveAggregateUses.end());
+ }
+ }
+ }
+ }
+ }
+
+ // Now we've inspected all callers, record the liveness of our return values.
+ for (unsigned i = 0; i != RetCount; ++i)
+ MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
+
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
+
+ // Now, check all of our arguments.
+ unsigned i = 0;
+ UseVector MaybeLiveArgUses;
+ for (Function::const_arg_iterator AI = F.arg_begin(),
+ E = F.arg_end(); AI != E; ++AI, ++i) {
+ Liveness Result;
+ if (F.getFunctionType()->isVarArg()) {
+ // Variadic functions will already have a va_arg function expanded inside
+ // them, making them potentially very sensitive to ABI changes resulting
+ // from removing arguments entirely, so don't. For example AArch64 handles
+ // register and stack HFAs very differently, and this is reflected in the
+ // IR which has already been generated.
+ Result = Live;
+ } else {
+ // See what the effect of this use is (recording any uses that cause
+ // MaybeLive in MaybeLiveArgUses).
+ Result = SurveyUses(&*AI, MaybeLiveArgUses);
+ }
+
+ // Mark the result.
+ MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
+ // Clear the vector again for the next iteration.
+ MaybeLiveArgUses.clear();
+ }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses) {
+ switch (L) {
+ case Live: MarkLive(RA); break;
+ case MaybeLive:
+ {
+ // Note any uses of this value, so this return value can be
+ // marked live whenever one of the uses becomes live.
+ for (const auto &MaybeLiveUse : MaybeLiveUses)
+ Uses.insert(std::make_pair(MaybeLiveUse, RA));
+ break;
+ }
+ }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally, mark any values that are used as this
+/// function's parameters or by its return values (according to Uses) live as
+/// well.
+void DeadArgumentEliminationPass::MarkLive(const Function &F) {
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
+ // Mark the function as live.
+ LiveFunctions.insert(&F);
+ // Mark all arguments as live.
+ for (unsigned i = 0, e = F.arg_size(); i != e; ++i)
+ PropagateLiveness(CreateArg(&F, i));
+ // Mark all return values as live.
+ for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i)
+ PropagateLiveness(CreateRet(&F, i));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
+ if (LiveFunctions.count(RA.F))
+ return; // Function was already marked Live.
+
+ if (!LiveValues.insert(RA).second)
+ return; // We were already marked Live.
+
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
+ PropagateLiveness(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate its liveness
+/// to any other values it uses (according to Uses).
+void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
+ // We don't use upper_bound (or equal_range) here, because our recursive call
+ // to ourselves is likely to cause the upper_bound (which is the first value
+ // not belonging to RA) to become erased and the iterator invalidated.
+ UseMap::iterator Begin = Uses.lower_bound(RA);
+ UseMap::iterator E = Uses.end();
+ UseMap::iterator I;
+ for (I = Begin; I != E && I->first == RA; ++I)
+ MarkLive(I->second);
+
+ // Erase RA from the Uses map (from the lower bound to wherever we ended up
+ // after the loop).
+ Uses.erase(Begin, I);
+}
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callees of
+// the function to not have these arguments and return values.
+//
+bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
+ // Don't modify fully live functions
+ if (LiveFunctions.count(F))
+ return false;
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has fewer arguments and a different return type.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type*> Params;
+
+ // Keep track of if we have a live 'returned' argument
+ bool HasLiveReturnedArg = false;
+
+ // Set up to build a new list of parameter attributes.
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ const AttributeList &PAL = F->getAttributes();
+
+ // Remember which arguments are still alive.
+ SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+ // Construct the new parameter list from non-dead arguments. Also construct
+  // a new set of parameter attributes to correspond; the return-value
+  // attributes are rebuilt separately below.
+ unsigned i = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++i) {
+ RetOrArg Arg = CreateArg(F, i);
+ if (LiveValues.erase(Arg)) {
+ Params.push_back(I->getType());
+ ArgAlive[i] = true;
+ ArgAttrVec.push_back(PAL.getParamAttributes(i));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
+ } else {
+ ++NumArgumentsEliminated;
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
+ << " (" << I->getName() << ") from " << F->getName()
+ << "\n");
+ }
+ }
+
+ // Find out the new return value.
+ Type *RetTy = FTy->getReturnType();
+ Type *NRetTy = nullptr;
+ unsigned RetCount = NumRetVals(F);
+
+ // -1 means unused, other numbers are the new index
+ SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+ std::vector<Type*> RetTypes;
+
+ // If there is a function with a live 'returned' argument but a dead return
+ // value, then there are two possible actions:
+ // 1) Eliminate the return value and take off the 'returned' attribute on the
+ // argument.
+ // 2) Retain the 'returned' attribute and treat the return value (but not the
+ // entire function) as live so that it is not eliminated.
+ //
+ // It's not clear in the general case which option is more profitable because,
+ // even in the absence of explicit uses of the return value, code generation
+ // is free to use the 'returned' attribute to do things like eliding
+ // save/restores of registers across calls. Whether or not this happens is
+ // target and ABI-specific as well as depending on the amount of register
+ // pressure, so there's no good way for an IR-level pass to figure this out.
+ //
+ // Fortunately, the only places where 'returned' is currently generated by
+ // the FE are places where 'returned' is basically free and almost always a
+ // performance win, so the second option can just be used always for now.
+ //
+ // This should be revisited if 'returned' is ever applied more liberally.
+ if (RetTy->isVoidTy() || HasLiveReturnedArg) {
+ NRetTy = RetTy;
+ } else {
+ // Look at each of the original return values individually.
+ for (unsigned i = 0; i != RetCount; ++i) {
+ RetOrArg Ret = CreateRet(F, i);
+ if (LiveValues.erase(Ret)) {
+ RetTypes.push_back(getRetComponentType(F, i));
+ NewRetIdxs[i] = RetTypes.size() - 1;
+ } else {
+ ++NumRetValsEliminated;
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << i << " from " << F->getName() << "\n");
+ }
+ }
+ if (RetTypes.size() > 1) {
+ // More than one return type? Reduce it down to size.
+ if (StructType *STy = dyn_cast<StructType>(RetTy)) {
+ // Make the new struct packed if we used to return a packed struct
+ // already.
+ NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
+ } else {
+ assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
+ NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
+ }
+ } else if (RetTypes.size() == 1)
+ // One return type? Just a simple value then, but only if we didn't use to
+ // return a struct with that simple value before.
+ NRetTy = RetTypes.front();
+ else if (RetTypes.size() == 0)
+ // No return types? Make it void, but only if we didn't use to return {}.
+ NRetTy = Type::getVoidTy(F->getContext());
+ }
+
+ assert(NRetTy && "No new return type found?");
+
+ // The existing function return attributes.
+ AttrBuilder RAttrs(PAL.getRetAttributes());
+
+ // Remove any incompatible attributes, but only if we removed all return
+ // values. Otherwise, ensure that we don't have any conflicting attributes
+ // here. Currently, this should not be possible, but special handling might be
+ // required when new return value attributes are added.
+ if (NRetTy->isVoidTy())
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ else
+ assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+ "Return attributes no longer compatible?");
+
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Params.size());
+ AttributeList NewPAL = AttributeList::get(
+ F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+
+ // Create the new function type based on the recomputed parameters.
+ FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+ // No change?
+ if (NFTy == FTy)
+ return false;
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, F->getLinkage());
+ NF->copyAttributesFrom(F);
+ NF->setComdat(F->getComdat());
+ NF->setAttributes(NewPAL);
+ // Insert the new function before the old function, so we won't be processing
+ // it again.
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value*> Args;
+ while (!F->use_empty()) {
+ CallSite CS(F->user_back());
+ Instruction *Call = CS.getInstruction();
+
+ ArgAttrVec.clear();
+ const AttributeList &CallPAL = CS.getAttributes();
+
+ // Adjust the call return attributes in case the function was changed to
+ // return void.
+ AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Declare these outside of the loops, so we can reuse them for the second
+    // loop, which walks over the varargs.
+ CallSite::arg_iterator I = CS.arg_begin();
+ unsigned i = 0;
+ // Loop over those operands, corresponding to the normal arguments to the
+ // original function, and add those that are still alive.
+ for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i)
+ if (ArgAlive[i]) {
+ Args.push_back(*I);
+ // Get original parameter attributes, but skip return attributes.
+ AttributeSet Attrs = CallPAL.getParamAttributes(i);
+ if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
+ // If the return type has changed, then get rid of 'returned' on the
+ // call site. The alternative is to make all 'returned' attributes on
+ // call sites keep the return value alive just like 'returned'
+          // attributes on function declarations, but it's less clearly a win,
+          // and this is not an expected case anyway.
+ ArgAttrVec.push_back(AttributeSet::get(
+ F->getContext(),
+ AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ } else {
+ // Otherwise, use the original attributes.
+ ArgAttrVec.push_back(Attrs);
+ }
+ }
+
+ // Push any varargs arguments on the list. Don't forget their attributes.
+ for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
+ Args.push_back(*I);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(i));
+ }
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Args.size());
+ AttributeList NewCallPAL = AttributeList::get(
+ F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call->getParent());
+ } else {
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallPAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+ Args.clear();
+ ArgAttrVec.clear();
+
+ Instruction *New = NewCS.getInstruction();
+ if (!Call->use_empty()) {
+ if (New->getType() == Call->getType()) {
+ // Return type not changed? Just replace users then.
+ Call->replaceAllUsesWith(New);
+ New->takeName(Call);
+ } else if (New->getType()->isVoidTy()) {
+ // Our return value has uses, but they will get removed later on.
+ // Replace by null for now.
+ if (!Call->getType()->isX86_MMXTy())
+ Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
+ } else {
+ assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
+ "Return type changed, but not into a void. The old return type"
+ " must have been a struct or an array!");
+ Instruction *InsertPt = Call;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest());
+ InsertPt = &*NewEdge->getFirstInsertionPt();
+ }
+
+ // We used to return a struct or array. Instead of doing smart stuff
+ // with all the uses, we will just rebuild it using extract/insertvalue
+ // chaining and let instcombine clean that up.
+ //
+ // Start out building up our return value from undef
+ Value *RetVal = UndefValue::get(RetTy);
+ for (unsigned i = 0; i != RetCount; ++i)
+ if (NewRetIdxs[i] != -1) {
+ Value *V;
+ if (RetTypes.size() > 1)
+ // We are still returning a struct, so extract the value from our
+ // return value
+ V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret",
+ InsertPt);
+ else
+ // We are now returning a single element, so just insert that
+ V = New;
+ // Insert the value at the old position
+ RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt);
+ }
+ // Now, replace all uses of the old call instruction with the return
+ // struct we built
+ Call->replaceAllUsesWith(RetVal);
+ New->takeName(Call);
+ }
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments, also transferring over the names as well.
+ i = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++i)
+ if (ArgAlive[i]) {
+ // If this is a live argument, move the name and users over to the new
+ // version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ } else {
+ // If this argument is dead, replace any uses of it with null constants
+ // (these are guaranteed to become unused later on).
+ if (!I->getType()->isX86_MMXTy())
+ I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
+ }
+
+ // If we change the return value of the function we must rewrite any return
+ // instructions. Check this now.
+ if (F->getReturnType() != NF->getReturnType())
+ for (BasicBlock &BB : *NF)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ Value *RetVal;
+
+ if (NFTy->getReturnType()->isVoidTy()) {
+ RetVal = nullptr;
+ } else {
+ assert(RetTy->isStructTy() || RetTy->isArrayTy());
+            // The original return value was a struct or array; insert
+            // extractvalue/insertvalue chains to extract only the values we
+            // need to return and insert them into our new result.
+            // This does generate messy code, but we'll leave it to instcombine
+            // to clean that up.
+ Value *OldRet = RI->getOperand(0);
+ // Start out building up our return value from undef
+ RetVal = UndefValue::get(NRetTy);
+ for (unsigned i = 0; i != RetCount; ++i)
+ if (NewRetIdxs[i] != -1) {
+ ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i,
+ "oldret", RI);
+ if (RetTypes.size() > 1) {
+ // We're still returning a struct, so reinsert the value into
+ // our new return value at the new index
+
+ RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i],
+ "newret", RI);
+ } else {
+ // We are now only returning a simple value, so just return the
+ // extracted value.
+ RetVal = EV;
+ }
+ }
+ }
+ // Replace the return instruction with one returning the new return
+ // value (possibly 0 if we became void).
+ ReturnInst::Create(F->getContext(), RetVal, RI);
+ BB.getInstList().erase(RI);
+ }
+
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NF->setSubprogram(F->getSubprogram());
+
+ // Now that the old function is dead, delete it.
+ F->eraseFromParent();
+
+ return true;
+}
+
+PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ bool Changed = false;
+
+ // First pass: Do a simple check to see if any functions can have their "..."
+ // removed. We can do this if they never call va_start. This loop cannot be
+ // fused with the next loop, because deleting a function invalidates
+ // information computed while surveying other functions.
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function &F = *I++;
+ if (F.getFunctionType()->isVarArg())
+ Changed |= DeleteDeadVarargs(F);
+ }
+
+  // Second phase: loop through the module, determining which arguments are
+  // live.
+ // We assume all arguments are dead unless proven otherwise (allowing us to
+ // determine that dead arguments passed into recursive functions are dead).
+ //
+ DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
+ for (auto &F : M)
+ SurveyFunction(F);
+
+ // Now, remove all dead arguments and return values from each function in
+ // turn.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    // Increment now, because the function will probably get removed (i.e.,
+ // replaced by a new one).
+ Function *F = &*I++;
+ Changed |= RemoveDeadStuffFromFunction(F);
+ }
+
+ // Finally, look for any unused parameters in functions with non-local
+ // linkage and replace the passed in parameters with undef.
+ for (auto &F : M)
+ Changed |= RemoveDeadArgumentsFromCallers(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
new file mode 100644
index 000000000000..ecff88c88dcb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -0,0 +1,96 @@
+//===-- ElimAvailExtern.cpp - Eliminate Available Externally Globals -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate available external global
+// definitions from the program, turning them into declarations.
+//
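+// For illustration (an example added for exposition, not present upstream):
+//
+//   define available_externally i32 @f() { ret i32 0 }
+//   @g = available_externally global i32 42
+//
+// become
+//
+//   declare i32 @f()
+//   @g = external global i32
+//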
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "elim-avail-extern"
+
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+static bool eliminateAvailableExternally(Module &M) {
+ bool Changed = false;
+
+ // Drop initializers of available externally global variables.
+ for (GlobalVariable &GV : M.globals()) {
+ if (!GV.hasAvailableExternallyLinkage())
+ continue;
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ GV.removeDeadConstantUsers();
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ NumVariables++;
+ Changed = true;
+ }
+
+ // Drop the bodies of available externally functions.
+ for (Function &F : M) {
+ if (!F.hasAvailableExternallyLinkage())
+ continue;
+ if (!F.isDeclaration())
+ // This will set the linkage to external
+ F.deleteBody();
+ F.removeDeadConstantUsers();
+ NumFunctions++;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
+ if (!eliminateAvailableExternally(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct EliminateAvailableExternallyLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+  // run - Do the EliminateAvailableExternally pass on the specified module.
+ //
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ return eliminateAvailableExternally(M);
+ }
+};
+}
+
+char EliminateAvailableExternallyLegacyPass::ID = 0;
+INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternallyLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
new file mode 100644
index 000000000000..d1147f7d844b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -0,0 +1,163 @@
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts specific global values and their dependencies from a
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+using namespace llvm;
+
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+ bool Local = GV.hasLocalLinkage();
+ if (Local || Delete) {
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ if (Local)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ return;
+ }
+
+ if (!GV.hasLinkOnceLinkage()) {
+ assert(!GV.isDiscardableIfUnused());
+ return;
+ }
+
+ // Map linkonce* to weak* so that llvm doesn't drop this GV.
+ switch(GV.getLinkage()) {
+ default:
+ llvm_unreachable("Unexpected linkage");
+ case GlobalValue::LinkOnceAnyLinkage:
+ GV.setLinkage(GlobalValue::WeakAnyLinkage);
+ return;
+ case GlobalValue::LinkOnceODRLinkage:
+ GV.setLinkage(GlobalValue::WeakODRLinkage);
+ return;
+ }
+}
+
+namespace {
+ /// @brief A pass to extract specific global values and their dependencies.
+ class GVExtractorPass : public ModulePass {
+ SetVector<GlobalValue *> Named;
+ bool deleteStuff;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ /// If deleteS is true, this pass deletes the specified global values.
+ /// Otherwise, it deletes as much of the module as possible, except for the
+ /// global values specified.
+ explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
+ bool deleteS = true)
+ : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // Visit the global inline asm.
+ if (!deleteStuff)
+ M.setModuleInlineAsm("");
+
+ // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
+ // implementation could figure out which GlobalValues are actually
+ // referenced by the Named set, and which GlobalValues in the rest of
+ // the module are referenced by the NamedSet, and get away with leaving
+ // more internal and private things internal and private. But for now,
+ // be conservative and simple.
+
+ // Visit the GlobalVariables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration();
+ if (!Delete) {
+ if (I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->getName() == "llvm.global_ctors")
+ continue;
+ }
+
+ makeVisible(*I, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ I->setInitializer(nullptr);
+ I->setComdat(nullptr);
+ }
+ }
+
+ // Visit the Functions.
+ for (Function &F : M) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
+ if (!Delete) {
+ if (F.hasAvailableExternallyLinkage())
+ continue;
+ }
+
+ makeVisible(F, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ F.deleteBody();
+ F.setComdat(nullptr);
+ }
+ }
+
+ // Visit the Aliases.
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ Module::alias_iterator CurI = I;
+ ++I;
+
+ bool Delete = deleteStuff == (bool)Named.count(&*CurI);
+ makeVisible(*CurI, Delete);
+
+ if (Delete) {
+ Type *Ty = CurI->getValueType();
+
+ CurI->removeFromParent();
+ llvm::Value *Declaration;
+ if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+ Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ CurI->getName(), &M);
+
+ } else {
+ Declaration =
+ new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, CurI->getName());
+
+ }
+ CurI->replaceAllUsesWith(Declaration);
+ delete &*CurI;
+ }
+ }
+
+ return true;
+ }
+ };
+
+ char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
+ bool deleteFn) {
+ return new GVExtractorPass(GVs, deleteFn);
+}
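
A minimal sketch, assuming a module that defines a function named "foo" (the name and the extractFoo wrapper are hypothetical), of driving the factory above; in-tree the GVs list is normally built by a tool such as llvm-extract:

#include <vector>

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

// Keep "foo" and strip the rest of the module; deleteFn=false selects the
// "delete everything except the named values" mode described above.
void extractFoo(llvm::Module &M) {
  std::vector<llvm::GlobalValue *> GVs;
  if (llvm::Function *F = M.getFunction("foo"))
    GVs.push_back(F);
  llvm::legacy::PassManager PM;
  PM.add(llvm::createGVExtractionPass(GVs, /*deleteFn=*/false));
  PM.run(M);
}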
diff --git a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
new file mode 100644
index 000000000000..968712138208
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -0,0 +1,122 @@
+//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "forceattrs"
+
+static cl::list<std::string>
+ ForceAttributes("force-attribute", cl::Hidden,
+ cl::desc("Add an attribute to a function. This should be a "
+ "pair of 'function-name:attribute-name', for "
+ "example -force-attribute=foo:noinline. This "
+ "option can be specified multiple times."));
+
+static Attribute::AttrKind parseAttrKind(StringRef Kind) {
+ return StringSwitch<Attribute::AttrKind>(Kind)
+ .Case("alwaysinline", Attribute::AlwaysInline)
+ .Case("builtin", Attribute::Builtin)
+ .Case("cold", Attribute::Cold)
+ .Case("convergent", Attribute::Convergent)
+ .Case("inlinehint", Attribute::InlineHint)
+ .Case("jumptable", Attribute::JumpTable)
+ .Case("minsize", Attribute::MinSize)
+ .Case("naked", Attribute::Naked)
+ .Case("nobuiltin", Attribute::NoBuiltin)
+ .Case("noduplicate", Attribute::NoDuplicate)
+ .Case("noimplicitfloat", Attribute::NoImplicitFloat)
+ .Case("noinline", Attribute::NoInline)
+ .Case("nonlazybind", Attribute::NonLazyBind)
+ .Case("noredzone", Attribute::NoRedZone)
+ .Case("noreturn", Attribute::NoReturn)
+ .Case("norecurse", Attribute::NoRecurse)
+ .Case("nounwind", Attribute::NoUnwind)
+ .Case("optnone", Attribute::OptimizeNone)
+ .Case("optsize", Attribute::OptimizeForSize)
+ .Case("readnone", Attribute::ReadNone)
+ .Case("readonly", Attribute::ReadOnly)
+ .Case("argmemonly", Attribute::ArgMemOnly)
+ .Case("returns_twice", Attribute::ReturnsTwice)
+ .Case("safestack", Attribute::SafeStack)
+ .Case("sanitize_address", Attribute::SanitizeAddress)
+ .Case("sanitize_memory", Attribute::SanitizeMemory)
+ .Case("sanitize_thread", Attribute::SanitizeThread)
+ .Case("ssp", Attribute::StackProtect)
+ .Case("sspreq", Attribute::StackProtectReq)
+ .Case("sspstrong", Attribute::StackProtectStrong)
+ .Case("uwtable", Attribute::UWTable)
+ .Default(Attribute::None);
+}
+
+/// If F has any forced attributes given on the command line, add them.
+static void addForcedAttributes(Function &F) {
+ for (auto &S : ForceAttributes) {
+ auto KV = StringRef(S).split(':');
+ if (KV.first != F.getName())
+ continue;
+
+ auto Kind = parseAttrKind(KV.second);
+ if (Kind == Attribute::None) {
+ DEBUG(dbgs() << "ForcedAttribute: " << KV.second
+ << " unknown or not handled!\n");
+ continue;
+ }
+ if (F.hasFnAttribute(Kind))
+ continue;
+ F.addFnAttr(Kind);
+ }
+}
+
+PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (ForceAttributes.empty())
+ return PreservedAnalyses::all();
+
+ for (Function &F : M.functions())
+ addForcedAttributes(F);
+
+ // Just conservatively invalidate analyses; this isn't likely to be important.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct ForceFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ForceFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeForceFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (ForceAttributes.empty())
+ return false;
+
+ for (Function &F : M.functions())
+ addForcedAttributes(F);
+
+ // Conservatively assume we changed something.
+ return true;
+ }
+};
+}
+
+char ForceFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
+ "Force set function attributes", false, false)
+
+Pass *llvm::createForceFunctionAttrsLegacyPass() {
+ return new ForceFunctionAttrsLegacyPass();
+}
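
A minimal sketch of exercising this pass; in practice the usual driver is opt (something like opt -forceattrs -force-attribute=foo:noinline), but the new-pass-manager class can also be invoked directly. The applyForcedAttrs wrapper below is ours:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"

// The pass only reads the -force-attribute command-line list, so a
// default-constructed analysis manager is sufficient for a direct call.
void applyForcedAttrs(llvm::Module &M) {
  llvm::ModuleAnalysisManager MAM;
  (void)llvm::ForceFunctionAttrsPass().run(M, MAM);
}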
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
new file mode 100644
index 000000000000..813a4b6e2831
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -0,0 +1,1319 @@
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements interprocedural passes which walk the
+/// call-graph deducing and/or propagating function attributes.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "functionattrs"
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReturned, "Number of arguments marked returned");
+STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
+STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
+STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
+
+// FIXME: This is disabled by default to avoid exposing security vulnerabilities
+// in C/C++ code compiled by clang:
+// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html
+static cl::opt<bool> EnableNonnullArgPropagation(
+ "enable-nonnull-arg-prop", cl::Hidden,
+ cl::desc("Try to propagate nonnull argument attributes from callsites to "
+ "caller functions."));
+
+namespace {
+typedef SmallSetVector<Function *, 8> SCCNodeSet;
+}
+
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+ AAResults &AAR,
+ const SCCNodeSet &SCCNodes) {
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
+ if (MRB == FMRB_DoesNotAccessMemory)
+ // Already perfect!
+ return MAK_ReadNone;
+
+ if (!ThisBody) {
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return MAK_ReadOnly;
+
+ // Conservatively assume it writes to memory.
+ return MAK_MayWrite;
+ }
+
+ // Scan the function body for instructions that may read or write memory.
+ bool ReadsMemory = false;
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+
+ // Some instructions can be ignored even if they read or write memory.
+ // Detect these now, skipping to the next instruction if one is found.
+ CallSite CS(cast<Value>(I));
+ if (CS) {
+ // Ignore calls to functions in the same SCC, as long as the call sites
+ // don't have operand bundles. Calls with operand bundles are allowed to
+ // have memory effects not described by the memory effects of the call
+ // target.
+ if (!CS.hasOperandBundles() && CS.getCalledFunction() &&
+ SCCNodes.count(CS.getCalledFunction()))
+ continue;
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS);
+
+ // If the call doesn't access memory, we're done.
+ if (!(MRB & MRI_ModRef))
+ continue;
+
+ if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ // The call could access any memory. If that includes writes, give up.
+ if (MRB & MRI_Mod)
+ return MAK_MayWrite;
+ // If it reads, note it.
+ if (MRB & MRI_Ref)
+ ReadsMemory = true;
+ continue;
+ }
+
+ // Check whether all pointer arguments point to local memory, and
+ // ignore calls that only access local memory.
+ for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+ CI != CE; ++CI) {
+ Value *Arg = *CI;
+ if (!Arg->getType()->isPtrOrPtrVectorTy())
+ continue;
+
+ AAMDNodes AAInfo;
+ I->getAAMetadata(AAInfo);
+ MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo);
+
+ // Skip accesses to local or constant memory as they don't impact the
+ // externally visible mod/ref behavior.
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+
+ if (MRB & MRI_Mod)
+ // Writes non-local memory. Give up.
+ return MAK_MayWrite;
+ if (MRB & MRI_Ref)
+ // Ok, it reads non-local memory.
+ ReadsMemory = true;
+ }
+ continue;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads from local memory. (Atomic is okay here.)
+ if (!LI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(LI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Ignore non-volatile stores to local memory. (Atomic is okay here.)
+ if (!SI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(SI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+ // Ignore vaargs on local memory.
+ MemoryLocation Loc = MemoryLocation::get(VI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+
+ // Any remaining instructions need to be taken seriously! Check if they
+ // read or write memory.
+ if (I->mayWriteToMemory())
+ // Writes memory. Just give up.
+ return MAK_MayWrite;
+
+ // If this instruction may read memory, remember that.
+ ReadsMemory |= I->mayReadFromMemory();
+ }
+
+ return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
+}
+
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+ AAResults &AAR) {
+ return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
+/// Deduce readonly/readnone attributes for the SCC.
+template <typename AARGetterT>
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
+ // Check if any of the functions in the SCC read or write memory. If they
+ // write memory then they can't be marked readnone or readonly.
+ bool ReadsMemory = false;
+ for (Function *F : SCCNodes) {
+ // Call the callable parameter to look up AA results for this function.
+ AAResults &AAR = AARGetter(*F);
+
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the
+ // comment on GlobalValue::isDefinitionExact for more details.
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+ AAR, SCCNodes)) {
+ case MAK_MayWrite:
+ return false;
+ case MAK_ReadOnly:
+ ReadsMemory = true;
+ break;
+ case MAK_ReadNone:
+ // Nothing to do!
+ break;
+ }
+ }
+
+ // Success! Functions in this SCC do not access memory, or only read memory.
+ // Give them the appropriate attribute.
+ bool MadeChange = false;
+ for (Function *F : SCCNodes) {
+ if (F->doesNotAccessMemory())
+ // Already perfect!
+ continue;
+
+ if (F->onlyReadsMemory() && ReadsMemory)
+ // No change.
+ continue;
+
+ MadeChange = true;
+
+ // Clear out any existing attributes.
+ F->removeFnAttr(Attribute::ReadOnly);
+ F->removeFnAttr(Attribute::ReadNone);
+
+ // Add in the new attribute.
+ F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
+
+ if (ReadsMemory)
+ ++NumReadOnly;
+ else
+ ++NumReadNone;
+ }
+
+ return MadeChange;
+}
+
+namespace {
+/// For a given pointer Argument, this retains a list of Arguments of functions
+/// in the same SCC that the pointer data flows into. We use this to build an
+/// SCC of the arguments.
+struct ArgumentGraphNode {
+ Argument *Definition;
+ SmallVector<ArgumentGraphNode *, 4> Uses;
+};
+
+class ArgumentGraph {
+ // We store pointers to ArgumentGraphNode objects, so it's important that
+ // they not move around upon insert.
+ typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy;
+
+ ArgumentMapTy ArgumentMap;
+
+ // There is no root node for the argument graph, in fact:
+ // void f(int *x, int *y) { if (...) f(x, y); }
+ // is an example where the graph is disconnected. The SCCIterator requires a
+ // single entry point, so we maintain a fake ("synthetic") root node that
+ // uses every node. Because the graph is directed and nothing points into
+ // the root, it will not participate in any SCCs (except for its own).
+ ArgumentGraphNode SyntheticRoot;
+
+public:
+ ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
+
+ typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator;
+
+ iterator begin() { return SyntheticRoot.Uses.begin(); }
+ iterator end() { return SyntheticRoot.Uses.end(); }
+ ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+ ArgumentGraphNode *operator[](Argument *A) {
+ ArgumentGraphNode &Node = ArgumentMap[A];
+ Node.Definition = A;
+ SyntheticRoot.Uses.push_back(&Node);
+ return &Node;
+ }
+};
+
+/// This tracker checks whether callees are in the SCC, and if so it does not
+/// consider that a capture, instead adding it to the "Uses" list and
+/// continuing with the analysis.
+struct ArgumentUsesTracker : public CaptureTracker {
+ ArgumentUsesTracker(const SCCNodeSet &SCCNodes)
+ : Captured(false), SCCNodes(SCCNodes) {}
+
+ void tooManyUses() override { Captured = true; }
+
+ bool captured(const Use *U) override {
+ CallSite CS(U->getUser());
+ if (!CS.getInstruction()) {
+ Captured = true;
+ return true;
+ }
+
+ Function *F = CS.getCalledFunction();
+ if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
+ Captured = true;
+ return true;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account for
+ // these.
+
+ unsigned UseIndex =
+ std::distance(const_cast<const Use *>(CS.arg_begin()), U);
+
+ assert(UseIndex < CS.data_operands_size() &&
+ "Indirect function calls should have been filtered above!");
+
+ if (UseIndex >= CS.getNumArgOperands()) {
+ // Data operand, but not an argument operand -- must be a bundle operand
+ assert(CS.hasOperandBundles() && "Must be!");
+
+ // CaptureTracking told us that we're being captured by an operand bundle
+ // use. In this case it does not matter if the callee is within our SCC
+ // or not -- we've been captured in some unknown way, and we have to be
+ // conservative.
+ Captured = true;
+ return true;
+ }
+
+ if (UseIndex >= F->arg_size()) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ Captured = true;
+ return true;
+ }
+
+ Uses.push_back(&*std::next(F->arg_begin(), UseIndex));
+ return false;
+ }
+
+ bool Captured; // True only if certainly captured (used outside our SCC).
+ SmallVector<Argument *, 4> Uses; // Uses within our SCC.
+
+ const SCCNodeSet &SCCNodes;
+};
+}
+
+namespace llvm {
+template <> struct GraphTraits<ArgumentGraphNode *> {
+ typedef ArgumentGraphNode *NodeRef;
+ typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType;
+
+ static NodeRef getEntryNode(NodeRef A) { return A; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
+};
+template <>
+struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
+ static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
+ static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
+ return AG->begin();
+ }
+ static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
+};
+}
+
+/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
+static Attribute::AttrKind
+determinePointerReadAttrs(Argument *A,
+ const SmallPtrSet<Argument *, 8> &SCCNodes) {
+
+ SmallVector<Use *, 32> Worklist;
+ SmallSet<Use *, 32> Visited;
+
+ // inalloca arguments are always clobbered by the call.
+ if (A->hasInAllocaAttr())
+ return Attribute::None;
+
+ bool IsRead = false;
+ // We don't need to track IsWritten. If A is written to, return immediately.
+
+ for (Use &U : A->uses()) {
+ Visited.insert(&U);
+ Worklist.push_back(&U);
+ }
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ // The original value is not read/written via this if the new value isn't.
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ break;
+
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ bool Captures = true;
+
+ if (I->getType()->isVoidTy())
+ Captures = false;
+
+ auto AddUsersToWorklistIfCapturing = [&] {
+ if (Captures)
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ };
+
+ CallSite CS(I);
+ if (CS.doesNotAccessMemory()) {
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+
+ Function *F = CS.getCalledFunction();
+ if (!F) {
+ if (CS.onlyReadsMemory()) {
+ IsRead = true;
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+ return Attribute::None;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account
+ // for these.
+
+ unsigned UseIndex = std::distance(CS.arg_begin(), U);
+
+ // U cannot be the callee operand use: since we're exploring the
+ // transitive uses of an Argument, having such a use be a callee would
+ // imply the CallSite is an indirect call or invoke; and we'd take the
+ // early exit above.
+ assert(UseIndex < CS.data_operands_size() &&
+ "Data operand use expected!");
+
+ bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands();
+
+ if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ return Attribute::None;
+ }
+
+ Captures &= !CS.doesNotCapture(UseIndex);
+
+ // Since the optimizer (by design) cannot see the data flow corresponding
+ // to an operand bundle use, these cannot participate in the optimistic SCC
+ // analysis. Instead, we model the operand bundle uses as arguments in a
+ // call to a function external to the SCC.
+ if (IsOperandBundleUse ||
+ !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
+
+ // The accessors used on CallSite here do the right thing for calls and
+ // invokes with operand bundles.
+
+ if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex))
+ return Attribute::None;
+ if (!CS.doesNotAccessMemory(UseIndex))
+ IsRead = true;
+ }
+
+ AddUsersToWorklistIfCapturing();
+ break;
+ }
+
+ case Instruction::Load:
+ // A volatile load has side effects beyond what the readonly attribute
+ // can be relied upon to describe.
+ if (cast<LoadInst>(I)->isVolatile())
+ return Attribute::None;
+
+ IsRead = true;
+ break;
+
+ case Instruction::ICmp:
+ case Instruction::Ret:
+ break;
+
+ default:
+ return Attribute::None;
+ }
+ }
+
+ return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+}
+
+/// Deduce returned attributes for the SCC.
+static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ // Check each function in turn, determining if an argument is always returned.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ if (F->getReturnType()->isVoidTy())
+ continue;
+
+ // There is nothing to do if an argument is already marked as 'returned'.
+ if (any_of(F->args(),
+ [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
+ continue;
+
+ auto FindRetArg = [&]() -> Value * {
+ Value *RetArg = nullptr;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ // Note that stripPointerCasts should look through functions with
+ // returned arguments.
+ Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
+ if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
+ return nullptr;
+
+ if (!RetArg)
+ RetArg = RetVal;
+ else if (RetArg != RetVal)
+ return nullptr;
+ }
+
+ return RetArg;
+ };
+
+ if (Value *RetArg = FindRetArg()) {
+ auto *A = cast<Argument>(RetArg);
+ A->addAttr(Attribute::Returned);
+ ++NumReturned;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
+static bool addArgumentAttrsFromCallsites(Function &F) {
+ if (!EnableNonnullArgPropagation)
+ return false;
+
+ bool Changed = false;
+
+ // For an argument attribute to transfer from a callsite to the parent, the
+ // call must be guaranteed to execute every time the parent is called.
+ // Conservatively, just check for calls in the entry block that are guaranteed
+ // to execute.
+ // TODO: This could be enhanced by testing if the callsite post-dominates the
+ // entry block or by doing simple forward walks or backward walks to the
+ // callsite.
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry) {
+ if (auto CS = CallSite(&I)) {
+ if (auto *CalledFunc = CS.getCalledFunction()) {
+ for (auto &CSArg : CalledFunc->args()) {
+ if (!CSArg.hasNonNullAttr())
+ continue;
+
+ // If the non-null callsite argument operand is an argument to 'F'
+ // (the caller) and the call is guaranteed to execute, then the value
+ // must be non-null throughout 'F'.
+ auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo()));
+ if (FArg && !FArg->hasNonNullAttr()) {
+ FArg->addAttr(Attribute::NonNull);
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ }
+
+ return Changed;
+}
+
+/// Deduce nocapture attributes for the SCC.
+static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ ArgumentGraph AG;
+
+ // Check each function in turn, determining which pointer arguments are not
+ // captured.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ Changed |= addArgumentAttrsFromCallsites(*F);
+
+ // Functions that are readonly (or readnone) and nounwind and don't return
+ // a value can't capture arguments. Don't analyze them.
+ if (F->onlyReadsMemory() && F->doesNotThrow() &&
+ F->getReturnType()->isVoidTy()) {
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ }
+ continue;
+ }
+
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (!A->getType()->isPointerTy())
+ continue;
+ bool HasNonLocalUses = false;
+ if (!A->hasNoCaptureAttr()) {
+ ArgumentUsesTracker Tracker(SCCNodes);
+ PointerMayBeCaptured(&*A, &Tracker);
+ if (!Tracker.Captured) {
+ if (Tracker.Uses.empty()) {
+ // If it's trivially not captured, mark it nocapture now.
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ } else {
+ // If it's not trivially captured and not trivially not captured,
+ // then it must be calling into another function in our SCC. Save
+ // its particulars for Argument-SCC analysis later.
+ ArgumentGraphNode *Node = AG[&*A];
+ for (Argument *Use : Tracker.Uses) {
+ Node->Uses.push_back(AG[Use]);
+ if (Use != &*A)
+ HasNonLocalUses = true;
+ }
+ }
+ }
+ // Otherwise, it's captured. Don't bother doing SCC analysis on it.
+ }
+ if (!HasNonLocalUses && !A->onlyReadsMemory()) {
+ // Can we determine that it's readonly/readnone without doing an SCC?
+ // Note that we don't allow any calls at all here, or else our result
+ // will be dependent on the iteration order through the functions in the
+ // SCC.
+ SmallPtrSet<Argument *, 8> Self;
+ Self.insert(&*A);
+ Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
+ if (R != Attribute::None) {
+ A->addAttr(R);
+ Changed = true;
+ R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+ }
+ }
+ }
+ }
+
+ // The graph we've collected is partial because we stopped scanning for
+ // argument uses once we solved the argument trivially. These partial nodes
+ // show up as ArgumentGraphNode objects with an empty Uses list, and for
+ // these nodes the final decision about whether they capture has already been
+ // made. If the definition doesn't have a 'nocapture' attribute by now, it
+ // captures.
+
+ for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
+ const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
+ if (ArgumentSCC.size() == 1) {
+ if (!ArgumentSCC[0]->Definition)
+ continue; // synthetic root node
+
+ // e.g. "void f(int* x) { if (...) f(x); }"
+ if (ArgumentSCC[0]->Uses.size() == 1 &&
+ ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
+ Argument *A = ArgumentSCC[0]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ continue;
+ }
+
+ bool SCCCaptured = false;
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *Node = *I;
+ if (Node->Uses.empty()) {
+ if (!Node->Definition->hasNoCaptureAttr())
+ SCCCaptured = true;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
+ // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
+ // quickly looking up whether a given Argument is in this ArgumentSCC.
+ for (ArgumentGraphNode *I : ArgumentSCC) {
+ ArgumentSCCNodes.insert(I->Definition);
+ }
+
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *N = *I;
+ for (ArgumentGraphNode *Use : N->Uses) {
+ Argument *A = Use->Definition;
+ if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
+ continue;
+ SCCCaptured = true;
+ break;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+
+ // We also want to compute readonly/readnone. With a small number of false
+ // negatives, we can assume that any pointer which is captured isn't going
+ // to be provably readonly or readnone, since by definition we can't
+ // analyze all uses of a captured pointer.
+ //
+ // The false negatives happen when the pointer is captured by a function
+ // that promises readonly/readnone behaviour on the pointer, then the
+ // pointer's lifetime ends before anything that writes to arbitrary memory.
+ // Also, a readonly/readnone pointer may be returned, but returning a
+ // pointer is capturing it.
+
+ Attribute::AttrKind ReadAttr = Attribute::ReadNone;
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
+ if (K == Attribute::ReadNone)
+ continue;
+ if (K == Attribute::ReadOnly) {
+ ReadAttr = Attribute::ReadOnly;
+ continue;
+ }
+ ReadAttr = K;
+ break;
+ }
+
+ if (ReadAttr != Attribute::None) {
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ // Clear out existing readonly/readnone attributes
+ A->removeAttr(Attribute::ReadOnly);
+ A->removeAttr(Attribute::ReadNone);
+ A->addAttr(ReadAttr);
+ ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// Tests whether a function is "malloc-like".
+///
+/// A function is "malloc-like" if it returns either null or a pointer that
+/// doesn't alias any other pointer visible to the caller.
+static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ if (Constant *C = dyn_cast<Constant>(RetVal)) {
+ if (!C->isNullValue() && !isa<UndefValue>(C))
+ return false;
+
+ continue;
+ }
+
+ if (isa<Argument>(RetVal))
+ return false;
+
+ if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (Value *IncValue : PN->incoming_values())
+ FlowsToReturn.insert(IncValue);
+ continue;
+ }
+
+ // Check whether the pointer came from an allocation.
+ case Instruction::Alloca:
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(RVI);
+ if (CS.hasRetAttr(Attribute::NoAlias))
+ break;
+ if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false; // Did not come from an allocation.
+ }
+
+ if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
+ return false;
+ }
+
+ return true;
+}
+
+/// Deduce noalias attributes for the SCC.
+static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
+ // Check each function in turn, determining which functions return noalias
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already noalias.
+ if (F->returnDoesNotAlias())
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate noalias return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ if (!isFunctionMallocLike(F, SCCNodes))
+ return false;
+ }
+
+ bool MadeChange = false;
+ for (Function *F : SCCNodes) {
+ if (F->returnDoesNotAlias() ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ F->setReturnDoesNotAlias();
+ ++NumNoAlias;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// Tests whether this function is known to not return null.
+///
+/// Requires that the function returns a pointer.
+///
+/// Returns true if it believes the function will not return null, and sets
+/// \p Speculative based on whether the returned conclusion is a speculative
+/// conclusion due to SCC calls.
+static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
+ bool &Speculative) {
+ assert(F->getReturnType()->isPointerTy() &&
+ "nonnull only meaningful on pointer types");
+ Speculative = false;
+
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ // If this value is locally known to be non-null, we're good
+ if (isKnownNonNull(RetVal))
+ continue;
+
+ // Otherwise, we need to look upwards since we can't make any local
+ // conclusions.
+ Instruction *RVI = dyn_cast<Instruction>(RetVal);
+ if (!RVI)
+ return false;
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ FlowsToReturn.insert(PN->getIncomingValue(i));
+ continue;
+ }
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(RVI);
+ Function *Callee = CS.getCalledFunction();
+ // A call to a node within the SCC is assumed to return null until
+ // proven otherwise
+ if (Callee && SCCNodes.count(Callee)) {
+ Speculative = true;
+ continue;
+ }
+ return false;
+ }
+ default:
+ return false; // Unknown source, may be null
+ };
+ llvm_unreachable("should have either continued or returned");
+ }
+
+ return true;
+}
+
+/// Deduce nonnull attributes for the SCC.
+static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
+ // Speculatively assume that all functions in the SCC return only nonnull
+ // pointers. We may refute this as we analyze functions.
+ bool SCCReturnsNonNull = true;
+
+ bool MadeChange = false;
+
+ // Check each function in turn, determining which functions return nonnull
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already nonnull.
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull))
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate nonnull return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ bool Speculative = false;
+ if (isReturnNonNull(F, SCCNodes, Speculative)) {
+ if (!Speculative) {
+ // Mark the function eagerly since we may discover a function
+ // which prevents us from speculating about the entire SCC
+ DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ continue;
+ }
+ // At least one function returns something which could be null, can't
+ // speculate any more.
+ SCCReturnsNonNull = false;
+ }
+
+ if (SCCReturnsNonNull) {
+ for (Function *F : SCCNodes) {
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull) ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// Remove the convergent attribute from all functions in the SCC if every
+/// callsite within the SCC is not convergent (except for calls to functions
+/// within the SCC). Returns true if changes were made.
+static bool removeConvergentAttrs(const SCCNodeSet &SCCNodes) {
+ // For every function in SCC, ensure that either
+ // * it is not convergent, or
+ // * we can remove its convergent attribute.
+ bool HasConvergentFn = false;
+ for (Function *F : SCCNodes) {
+ if (!F->isConvergent()) continue;
+ HasConvergentFn = true;
+
+ // Can't remove convergent from function declarations.
+ if (F->isDeclaration()) return false;
+
+ // Can't remove convergent if any of our functions has a convergent call to a
+ // function not in the SCC.
+ for (Instruction &I : instructions(*F)) {
+ CallSite CS(&I);
+ // Bail if CS is a convergent call to a function not in the SCC.
+ if (CS && CS.isConvergent() &&
+ SCCNodes.count(CS.getCalledFunction()) == 0)
+ return false;
+ }
+ }
+
+ // If the SCC doesn't have any convergent functions, we have nothing to do.
+ if (!HasConvergentFn) return false;
+
+ // If we got here, all of the calls the SCC makes to functions not in the SCC
+ // are non-convergent. Therefore all of the SCC's functions can also be made
+ // non-convergent. We'll remove the attr from the callsites in
+ // InstCombineCalls.
+ for (Function *F : SCCNodes) {
+ if (!F->isConvergent()) continue;
+
+ DEBUG(dbgs() << "Removing convergent attr from fn " << F->getName()
+ << "\n");
+ F->setNotConvergent();
+ }
+ return true;
+}
+
+static bool setDoesNotRecurse(Function &F) {
+ if (F.doesNotRecurse())
+ return false;
+ F.setDoesNotRecurse();
+ ++NumNoRecurse;
+ return true;
+}
+
+static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
+ // Try and identify functions that do not recurse.
+
+ // If the SCC contains multiple nodes we know for sure there is recursion.
+ if (SCCNodes.size() != 1)
+ return false;
+
+ Function *F = *SCCNodes.begin();
+ if (!F || F->isDeclaration() || F->doesNotRecurse())
+ return false;
+
+ // If all of the calls in F are identifiable and are to norecurse functions, F
+ // is norecurse. This check also detects self-recursion, since F is not yet
+ // marked norecurse, so any call from F to F is treated as potentially recursive.
+ for (Instruction &I : instructions(*F))
+ if (auto CS = CallSite(&I)) {
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee == F || !Callee->doesNotRecurse())
+ // Function calls a potentially recursive function.
+ return false;
+ }
+
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function cannot
+ // recurse.
+ return setDoesNotRecurse(*F);
+}
+
+PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ // We pass a lambda into functions to wire them up to the analysis manager
+ // for getting function analyses.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+
+ // Fill SCCNodes with the elements of the SCC. Also track whether there are
+ // any external or opt-none nodes that will prevent us from optimizing any
+ // part of the SCC.
+ SCCNodeSet SCCNodes;
+ bool HasUnknownCall = false;
+ for (LazyCallGraph::Node &N : C) {
+ Function &F = N.getFunction();
+ if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ // Treat any function we're trying not to optimize as if it were an
+ // indirect call and omit it from the node set used below.
+ HasUnknownCall = true;
+ continue;
+ }
+ // Track whether any functions in this SCC have an unknown call edge.
+ // Note: if this is ever a performance hit, we can common it with
+ // subsequent routines which also do scans over the instructions of the
+ // function.
+ if (!HasUnknownCall)
+ for (Instruction &I : instructions(F))
+ if (auto CS = CallSite(&I))
+ if (!CS.getCalledFunction()) {
+ HasUnknownCall = true;
+ break;
+ }
+
+ SCCNodes.insert(&F);
+ }
+
+ bool Changed = false;
+ Changed |= addArgumentReturnedAttrs(SCCNodes);
+ Changed |= addReadAttrs(SCCNodes, AARGetter);
+ Changed |= addArgumentAttrs(SCCNodes);
+
+ // If we have no external nodes participating in the SCC, we can deduce some
+ // more precise attributes as well.
+ if (!HasUnknownCall) {
+ Changed |= addNoAliasAttrs(SCCNodes);
+ Changed |= addNonNullAttrs(SCCNodes);
+ Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= addNoRecurseAttrs(SCCNodes);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+namespace {
+struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
+ initializePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char PostOrderFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs",
+ "Deduce function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "functionattrs",
+ "Deduce function attributes", false, false)
+
+Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
+ return new PostOrderFunctionAttrsLegacyPass();
+}
+
+template <typename AARGetterT>
+static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
+ bool Changed = false;
+
+ // Fill SCCNodes with the elements of the SCC. Used for quickly looking up
+ // whether a given CallGraphNode is in this SCC. Also track whether there are
+ // any external or opt-none nodes that will prevent us from optimizing any
+ // part of the SCC.
+ SCCNodeSet SCCNodes;
+ bool ExternalNode = false;
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
+ // External node or function we're trying not to optimize - we both avoid
+ // transforming them and avoid leveraging the information they provide.
+ ExternalNode = true;
+ continue;
+ }
+
+ SCCNodes.insert(F);
+ }
+
+ // Skip it if the SCC only contains optnone functions.
+ if (SCCNodes.empty())
+ return Changed;
+
+ Changed |= addArgumentReturnedAttrs(SCCNodes);
+ Changed |= addReadAttrs(SCCNodes, AARGetter);
+ Changed |= addArgumentAttrs(SCCNodes);
+
+ // If we have no external nodes participating in the SCC, we can deduce some
+ // more precise attributes as well.
+ if (!ExternalNode) {
+ Changed |= addNoAliasAttrs(SCCNodes);
+ Changed |= addNonNullAttrs(SCCNodes);
+ Changed |= removeConvergentAttrs(SCCNodes);
+ Changed |= addNoRecurseAttrs(SCCNodes);
+ }
+
+ return Changed;
+}
+
+bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return runImpl(SCC, LegacyAARGetter(*this));
+}
+
+namespace {
+struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<CallGraphWrapperPass>();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+}
+
+char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs",
+ "Deduce function attributes in RPO", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs",
+ "Deduce function attributes in RPO", false, false)
+
+Pass *llvm::createReversePostOrderFunctionAttrsPass() {
+ return new ReversePostOrderFunctionAttrsLegacyPass();
+}
+
+static bool addNoRecurseAttrsTopDown(Function &F) {
+ // We check the preconditions for the function prior to calling this to avoid
+ // the cost of building up a reversible post-order list. We assert them here
+ // to make sure none of the invariants this relies on were violated.
+ assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
+ assert(!F.doesNotRecurse() &&
+ "This function has already been deduced as norecurs!");
+ assert(F.hasInternalLinkage() &&
+ "Can only do top-down deduction for internal linkage functions!");
+
+ // If F is internal and all of its uses are calls from non-recursive
+ // functions, then none of its calls could in fact recurse without going
+ // through a function marked norecurse, and so we can mark this function too
+ // as norecurse. Note that the uses must actually be calls -- otherwise
+ // a pointer to this function could be returned from a norecurse function but
+ // this function could be recursively (indirectly) called. Note that this
+ // also detects if F is directly recursive as F is not yet marked as
+ // a norecurse function.
+ for (auto *U : F.users()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ CallSite CS(I);
+ if (!CS || !CS.getParent()->getParent()->doesNotRecurse())
+ return false;
+ }
+ return setDoesNotRecurse(F);
+}
+
+static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
+ // We only have a post-order SCC traversal (because SCCs are inherently
+ // discovered in post-order), so we accumulate them in a vector and then walk
+ // it in reverse. This is simpler than using the RPO iterator infrastructure
+ // because we need to combine SCC detection and the PO walk of the call
+ // graph. We can also cheat egregiously because we're primarily interested in
+ // synthesizing norecurse, and so we only save the singleton SCCs, as SCCs
+ // with multiple functions in them will clearly be recursive.
+ SmallVector<Function *, 16> Worklist;
+ for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+ if (I->size() != 1)
+ continue;
+
+ Function *F = I->front()->getFunction();
+ if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
+ F->hasInternalLinkage())
+ Worklist.push_back(F);
+ }
+
+ bool Changed = false;
+ for (auto *F : reverse(Worklist))
+ Changed |= addNoRecurseAttrsTopDown(*F);
+
+ return Changed;
+}
+
+bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ return deduceFunctionAttributeInRPO(M, CG);
+}
+
+PreservedAnalyses
+ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+ if (!deduceFunctionAttributeInRPO(M, CG))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
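
A minimal sketch of scheduling both legacy entry points defined above; required analyses (call graph, assumption cache, AA) are pulled in automatically by the legacy pass manager. The deduceFunctionAttrs wrapper name is ours:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

// Bottom-up attribute deduction over call-graph SCCs, followed by the
// top-down (RPO) norecurse propagation for internal functions.
void deduceFunctionAttrs(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
  PM.add(llvm::createReversePostOrderFunctionAttrsPass());
  PM.run(M);
}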
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
new file mode 100644
index 000000000000..233a36d2bc54
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -0,0 +1,887 @@
+//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Function import based on summaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionImport.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+
+#define DEBUG_TYPE "function-import"
+
+using namespace llvm;
+
+STATISTIC(NumImportedFunctions, "Number of functions imported");
+STATISTIC(NumImportedModules, "Number of modules imported from");
+STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
+STATISTIC(NumLiveSymbols, "Number of live symbols in index");
+
+/// Limit on instruction count of imported functions.
+static cl::opt<unsigned> ImportInstrLimit(
+ "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import functions with less than N instructions"));
+
+static cl::opt<float>
+ ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
+ cl::Hidden, cl::value_desc("x"),
+ cl::desc("As we import functions, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotInstrFactor(
+ "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc("As we import functions called from hot callsite, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotMultiplier(
+ "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"),
+ cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
+
+static cl::opt<float> ImportCriticalMultiplier(
+ "import-critical-multiplier", cl::init(100.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc(
+ "Multiply the `import-instr-limit` threshold for critical callsites"));
+
+// FIXME: This multiplier was not really tuned up.
+static cl::opt<float> ImportColdMultiplier(
+ "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
+
+static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
+ cl::desc("Print imported functions"));
+
+static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
+ cl::desc("Compute dead symbols"));
+
+static cl::opt<bool> EnableImportMetadata(
+ "enable-import-metadata", cl::init(
+#if !defined(NDEBUG)
+ true /*Enabled with asserts.*/
+#else
+ false
+#endif
+ ),
+ cl::Hidden, cl::desc("Enable import metadata like 'thinlto_src_module'"));
+
+// Lazily load a module from \p FileName in \p Context.
+static std::unique_ptr<Module> loadFile(const std::string &FileName,
+ LLVMContext &Context) {
+ SMDiagnostic Err;
+ DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+ // Metadata isn't loaded until functions are imported, to minimize
+ // the memory overhead.
+ std::unique_ptr<Module> Result =
+ getLazyIRFileModule(FileName, Err, Context,
+ /* ShouldLazyLoadMetadata = */ true);
+ if (!Result) {
+ Err.print("function-import", errs());
+ report_fatal_error("Abort");
+ }
+
+ return Result;
+}
+
+namespace {
+
+/// Given a list of possible callee implementation for a call site, select one
+/// that fits the \p Threshold.
+///
+/// FIXME: select "best" instead of first that fits. But what is "best"?
+/// - The smallest: more likely to be inlined.
+/// - The one with the least outgoing edges (already well optimized).
+/// - One from a module already being imported from in order to reduce the
+/// number of source modules parsed/linked.
+/// - One that has PGO data attached.
+/// - [insert your fancy metric here]
+static const GlobalValueSummary *
+selectCallee(const ModuleSummaryIndex &Index,
+ ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
+ unsigned Threshold, StringRef CallerModulePath) {
+ auto It = llvm::find_if(
+ CalleeSummaryList,
+ [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
+ auto *GVSummary = SummaryPtr.get();
+ if (GlobalValue::isInterposableLinkage(GVSummary->linkage()))
+ // There is no point in importing these; we can't inline them
+ return false;
+ if (auto *AS = dyn_cast<AliasSummary>(GVSummary)) {
+ GVSummary = &AS->getAliasee();
+ // An alias can't point to "available_externally". However, when we import
+ // linkOnceODR the linkage does not change, so we import the alias
+ // and aliasee only in this case.
+ // FIXME: we should import the alias as an available_externally *function*;
+ // the destination module does not need to know it is an alias.
+ if (!GlobalValue::isLinkOnceODRLinkage(GVSummary->linkage()))
+ return false;
+ }
+
+ auto *Summary = cast<FunctionSummary>(GVSummary);
+
+ // If this is a local function, make sure we import the copy
+ // in the caller's module. The only time a local function can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ // However, do the import from another module if there is only one
+ // entry in the list - in that case this must be a reference due
+ // to indirect call profile data, since a function pointer can point to
+ // a local in another module.
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
+ CalleeSummaryList.size() > 1 &&
+ Summary->modulePath() != CallerModulePath)
+ return false;
+
+ if (Summary->instCount() > Threshold)
+ return false;
+
+ if (Summary->notEligibleToImport())
+ return false;
+
+ return true;
+ });
+ if (It == CalleeSummaryList.end())
+ return nullptr;
+
+ return cast<GlobalValueSummary>(It->get());
+}
+
+using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */,
+ GlobalValue::GUID>;
+
+/// Compute the list of functions to import for a given caller. Mark these
+/// imported functions and the symbols they reference in their source module as
+/// exported from their source module.
+static void computeImportForFunction(
+ const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
+ const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
+ SmallVectorImpl<EdgeInfo> &Worklist,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ for (auto &Edge : Summary.calls()) {
+ ValueInfo VI = Edge.first;
+ DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold
+ << "\n");
+
+ if (VI.getSummaryList().empty()) {
+ // For SamplePGO, the indirect call targets for local functions will
+ // have their original names annotated in the profile. We try to find the
+ // corresponding PGOFuncName as the GUID.
+ auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
+ if (GUID == 0)
+ continue;
+ VI = Index.getValueInfo(GUID);
+ if (!VI)
+ continue;
+ }
+
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ DEBUG(dbgs() << "ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
+ if (Hotness == CalleeInfo::HotnessType::Hot)
+ return ImportHotMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Cold)
+ return ImportColdMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Critical)
+ return ImportCriticalMultiplier;
+ return 1.0;
+ };
+
+ const auto NewThreshold =
+ Threshold * GetBonusMultiplier(Edge.second.Hotness);
+
+ auto *CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath());
+ if (!CalleeSummary) {
+ DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
+ // "Resolve" the summary, traversing alias,
+ const FunctionSummary *ResolvedCalleeSummary;
+ if (isa<AliasSummary>(CalleeSummary)) {
+ ResolvedCalleeSummary = cast<FunctionSummary>(
+ &cast<AliasSummary>(CalleeSummary)->getAliasee());
+ assert(
+ GlobalValue::isLinkOnceODRLinkage(ResolvedCalleeSummary->linkage()) &&
+ "Unexpected alias to a non-linkonceODR in import list");
+ } else
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
+ "selectCallee() didn't honor the threshold");
+
+ auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
+ // Adjust the threshold for next level of imported functions.
+ // The threshold is different for hot callsites because we can then
+ // inline chains of hot calls.
+ if (IsHotCallsite)
+ return Threshold * ImportHotInstrFactor;
+ return Threshold * ImportInstrFactor;
+ };
+
+ bool IsHotCallsite = Edge.second.Hotness == CalleeInfo::HotnessType::Hot;
+ const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto &ProcessedThreshold = ImportList[ExportModulePath][VI.getGUID()];
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back to
+ // the worklist with the new threshold.
+ if (ProcessedThreshold && ProcessedThreshold >= AdjThreshold) {
+ DEBUG(dbgs() << "ignored! Target was already seen with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
+ bool PreviouslyImported = ProcessedThreshold != 0;
+ // Mark this function as imported in this module, with the current Threshold
+ ProcessedThreshold = AdjThreshold;
+
+ // Make exports in the source module.
+ if (ExportLists) {
+ auto &ExportList = (*ExportLists)[ExportModulePath];
+ ExportList.insert(VI.getGUID());
+ if (!PreviouslyImported) {
+ // This is the first time this function was exported from its source
+ // module, so mark all functions and globals it references as exported
+ // to the outside if they are defined in the same source module.
+ // For efficiency, we unconditionally add all the referenced GUIDs
+ // to the ExportList for this module, and will prune out any not
+ // defined in the module later in a single pass.
+ for (auto &Edge : ResolvedCalleeSummary->calls()) {
+ auto CalleeGUID = Edge.first.getGUID();
+ ExportList.insert(CalleeGUID);
+ }
+ for (auto &Ref : ResolvedCalleeSummary->refs()) {
+ auto GUID = Ref.getGUID();
+ ExportList.insert(GUID);
+ }
+ }
+ }
+
+ // Insert the newly imported function to the worklist.
+ Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, VI.getGUID());
+ }
+}
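+
+// Illustrative threshold evolution for the logic above (a sketch only; the
+// concrete numbers are hypothetical, the real values come from the cl::opt
+// defaults for ImportInstrLimit and the various multipliers/factors):
+//
+//   T = 100 (initial per-module threshold)
+//   hot edge:   selectCallee() is given T * ImportHotMultiplier (e.g. 300),
+//               and the imported callee's own edges are revisited with
+//               T * ImportHotInstrFactor (e.g. 300).
+//   other edge: selectCallee() is given T * 1.0 (100), and the imported
+//               callee's edges are revisited with T * ImportInstrFactor
+//               (e.g. 70).
+//
+// Cold chains therefore decay quickly while hot call chains keep a larger
+// import budget.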
+
+/// Given the list of globals defined in a module, compute the list of imports
+/// as well as the list of "exports", i.e. the list of symbols referenced from
+/// another module (that may require promotion).
+static void ComputeImportForModule(
+ const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ // Worklist contains the list of functions imported into this module, for
+ // which we will analyse the callees and may import further down the call
+ // graph.
+ SmallVector<EdgeInfo, 128> Worklist;
+
+ // Populate the worklist with the imports for the functions defined in the
+ // current module.
+ for (auto &GVSummary : DefinedGVSummaries) {
+ if (!Index.isGlobalValueLive(GVSummary.second)) {
+ DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
+ continue;
+ }
+ auto *Summary = GVSummary.second;
+ if (auto *AS = dyn_cast<AliasSummary>(Summary))
+ Summary = &AS->getAliasee();
+ auto *FuncSummary = dyn_cast<FunctionSummary>(Summary);
+ if (!FuncSummary)
+ // Skip import for global variables
+ continue;
+ DEBUG(dbgs() << "Initalize import for " << GVSummary.first << "\n");
+ computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
+ DefinedGVSummaries, Worklist, ImportList,
+ ExportLists);
+ }
+
+ // Process the newly imported functions and add callees to the worklist.
+ while (!Worklist.empty()) {
+ auto FuncInfo = Worklist.pop_back_val();
+ auto *Summary = std::get<0>(FuncInfo);
+ auto Threshold = std::get<1>(FuncInfo);
+ auto GUID = std::get<2>(FuncInfo);
+
+ // Check if we later added this summary with a higher threshold.
+ // If so, skip this entry.
+ auto ExportModulePath = Summary->modulePath();
+ auto &LatestProcessedThreshold = ImportList[ExportModulePath][GUID];
+ if (LatestProcessedThreshold > Threshold)
+ continue;
+
+ computeImportForFunction(*Summary, Index, Threshold, DefinedGVSummaries,
+ Worklist, ImportList, ExportLists);
+ }
+}
+
+} // anonymous namespace
+
+/// Compute all the import and export for every module using the Index.
+void llvm::ComputeCrossModuleImport(
+ const ModuleSummaryIndex &Index,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+ // For each module that has functions defined, compute the import/export lists.
+ for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
+ auto &ImportList = ImportLists[DefinedGVSummaries.first()];
+ DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
+ ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
+ &ExportLists);
+ }
+
+ // When computing imports we added all GUIDs referenced by anything
+ // imported from the module to its ExportList. Now we prune each ExportList
+ // of any not defined in that module. This is more efficient than checking
+ // while computing imports because some of the summary lists may be long
+ // due to linkonce (comdat) copies.
+ for (auto &ELI : ExportLists) {
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ELI.first());
+ for (auto EI = ELI.second.begin(); EI != ELI.second.end();) {
+ if (!DefinedGVSummaries.count(*EI))
+ EI = ELI.second.erase(EI);
+ else
+ ++EI;
+ }
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
+ for (auto &ModuleImports : ImportLists) {
+ auto ModName = ModuleImports.first();
+ auto &Exports = ExportLists[ModName];
+ DEBUG(dbgs() << "* Module " << ModName << " exports " << Exports.size()
+ << " functions. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
+ for (auto &Src : ModuleImports.second) {
+ auto SrcModName = Src.first();
+ DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
+ << SrcModName << "\n");
+ }
+ }
+#endif
+}
+
+/// Compute all the imports for the given module in the Index.
+void llvm::ComputeCrossModuleImportForModule(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+
+ // Collect the list of functions this module defines.
+ // GUID -> Summary
+ GVSummaryMapTy FunctionSummaryMap;
+ Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
+
+ // Compute the import list for this module.
+ DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ ComputeImportForModule(FunctionSummaryMap, Index, ImportList);
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
+ for (auto &Src : ImportList) {
+ auto SrcModName = Src.first();
+ DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
+ << SrcModName << "\n");
+ }
+#endif
+}
+
+void llvm::computeDeadSymbols(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ assert(!Index.withGlobalValueDeadStripping());
+ if (!ComputeDead)
+ return;
+ if (GUIDPreservedSymbols.empty())
+ // Don't do anything when no symbols are preserved; this keeps the behavior
+ // friendly for tests.
+ return;
+ unsigned LiveSymbols = 0;
+ SmallVector<ValueInfo, 128> Worklist;
+ Worklist.reserve(GUIDPreservedSymbols.size() * 2);
+ for (auto GUID : GUIDPreservedSymbols) {
+ ValueInfo VI = Index.getValueInfo(GUID);
+ if (!VI)
+ continue;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ }
+
+ // Add values flagged in the index as live roots to the worklist.
+ for (const auto &Entry : Index)
+ for (auto &S : Entry.second.SummaryList)
+ if (S->isLive()) {
+ DEBUG(dbgs() << "Live root: " << Entry.first << "\n");
+ Worklist.push_back(ValueInfo(&Entry));
+ ++LiveSymbols;
+ break;
+ }
+
+ // Make value live and add it to the worklist if it was not live before.
+ // FIXME: we should only make the prevailing copy live here
+ auto visit = [&](ValueInfo VI) {
+ for (auto &S : VI.getSummaryList())
+ if (S->isLive())
+ return;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ ++LiveSymbols;
+ Worklist.push_back(VI);
+ };
+
+ while (!Worklist.empty()) {
+ auto VI = Worklist.pop_back_val();
+ for (auto &Summary : VI.getSummaryList()) {
+ for (auto Ref : Summary->refs())
+ visit(Ref);
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+ for (auto Call : FS->calls())
+ visit(Call.first);
+ if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
+ auto AliaseeGUID = AS->getAliasee().getOriginalName();
+ ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID);
+ if (AliaseeVI)
+ visit(AliaseeVI);
+ }
+ }
+ }
+ Index.setWithGlobalValueDeadStripping();
+
+ unsigned DeadSymbols = Index.size() - LiveSymbols;
+ DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols;
+ NumLiveSymbols += LiveSymbols;
+}
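+
+// Illustrative driver usage (a sketch, not part of this file): dead-symbol
+// computation must run before import computation so that
+// ComputeImportForModule() can skip dead GUIDs via Index.isGlobalValueLive().
+//
+//   computeDeadSymbols(Index, GUIDPreservedSymbols);
+//   ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
+//                            ExportLists);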
+
+/// Compute the set of summaries needed for a ThinLTO backend compilation of
+/// \p ModulePath.
+void llvm::gatherImportedSummariesForModule(
+ StringRef ModulePath,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ const FunctionImporter::ImportMapTy &ImportList,
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ // Include all summaries from the importing module.
+ ModuleToSummariesForIndex[ModulePath] =
+ ModuleToDefinedGVSummaries.lookup(ModulePath);
+ // Include summaries for imports.
+ for (auto &ILI : ImportList) {
+ auto &SummariesForIndex = ModuleToSummariesForIndex[ILI.first()];
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ILI.first());
+ for (auto &GI : ILI.second) {
+ const auto &DS = DefinedGVSummaries.find(GI.first);
+ assert(DS != DefinedGVSummaries.end() &&
+ "Expected a defined summary for imported global value");
+ SummariesForIndex[GI.first] = DS->second;
+ }
+ }
+}
+
+/// Emit the files \p ModulePath will import from into \p OutputFilename.
+std::error_code
+llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
+ const FunctionImporter::ImportMapTy &ModuleImports) {
+ std::error_code EC;
+ raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None);
+ if (EC)
+ return EC;
+ for (auto &ILI : ModuleImports)
+ ImportsOS << ILI.first() << "\n";
+ return std::error_code();
+}
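+
+// The emitted file is just a newline-separated list of the source modules
+// that \p ModulePath imports from, e.g. (hypothetical paths):
+//
+//   lib/a.o
+//   lib/b.o
+//
+// A distributed ThinLTO build can consume this list to know which bitcode
+// files must be made available when the backend compiles ModulePath.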
+
+/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
+void llvm::thinLTOResolveWeakForLinkerModule(
+ Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto ConvertToDeclaration = [](GlobalValue &GV) {
+ DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ } else
+ // For now we don't resolve or drop aliases. Once we do we'll
+ // need to add support here for creating either a function or
+ // variable declaration, and return the new GlobalValue* for
+ // the caller to use.
+ llvm_unreachable("Expected function or variable");
+ };
+
+ auto updateLinkage = [&](GlobalValue &GV) {
+ // See if the global summary analysis computed a new resolved linkage.
+ const auto &GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end())
+ return;
+ auto NewLinkage = GS->second->linkage();
+ if (NewLinkage == GV.getLinkage())
+ return;
+
+ // Switch the linkage to weakany if asked for, e.g. we do this for
+ // linker redefined symbols (via --wrap or --defsym).
+ // We record that the linkage should be changed here in `addThinLTO`
+ // as we need access to the resolution vectors for each input file in
+ // order to find which symbols have been redefined.
+ // We may consider reorganizing this code and moving the linkage recording
+ // somewhere else, e.g. in thinLTOResolveWeakForLinkerInIndex.
+ if (NewLinkage == GlobalValue::WeakAnyLinkage) {
+ GV.setLinkage(NewLinkage);
+ return;
+ }
+
+ if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
+ return;
+ // Check for a non-prevailing def that has interposable linkage
+ // (e.g. non-odr weak or linkonce). In that case we can't simply
+ // convert to available_externally, since it would lose the
+ // interposable property and possibly get inlined. Simply drop
+ // the definition in that case.
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
+ GlobalValue::isInterposableLinkage(GV.getLinkage()))
+ ConvertToDeclaration(GV);
+ else {
+ DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
+ << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GV.setLinkage(NewLinkage);
+ }
+ // Remove declarations from comdats, including available_externally
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
+ GO->setComdat(nullptr);
+ };
+
+ // Process functions, globals, and aliases now.
+ for (auto &GV : TheModule)
+ updateLinkage(GV);
+ for (auto &GV : TheModule.globals())
+ updateLinkage(GV);
+ for (auto &GV : TheModule.aliases())
+ updateLinkage(GV);
+}
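+
+// Illustrative effect (a sketch, assuming the summary analysis selected one
+// prevailing copy of a weak-for-linker symbol):
+//
+//   prevailing linkonce_odr copy     -> weak_odr (definition is kept)
+//   non-prevailing linkonce_odr copy -> available_externally (still inlinable,
+//                                       dropped by the linker)
+//   non-prevailing interposable weak -> converted to a declaration, since
+//                                       available_externally would lose the
+//                                       interposable property (see above)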
+
+/// Run internalization on \p TheModule based on summary analysis.
+void llvm::thinLTOInternalizeModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals) {
+ // Parse inline ASM and collect the list of symbols that are not defined in
+ // the current module.
+ StringSet<> AsmUndefinedRefs;
+ ModuleSymbolTable::CollectAsmSymbols(
+ TheModule,
+ [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
+ if (Flags & object::BasicSymbolRef::SF_Undefined)
+ AsmUndefinedRefs.insert(Name);
+ });
+
+ // Declare a callback for the internalize pass that decides, for every
+ // candidate GlobalValue, whether it can be internalized or not.
+ auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // Can't be internalized if referenced in inline asm.
+ if (AsmUndefinedRefs.count(GV.getName()))
+ return true;
+
+ // Lookup the linkage recorded in the summaries during global analysis.
+ auto GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end()) {
+ // Must have been promoted (possibly conservatively). Find original
+ // name so that we can access the correct summary and see if it can
+ // be internalized again.
+ // FIXME: Eventually we should control promotion instead of promoting
+ // and internalizing again.
+ StringRef OrigName =
+ ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
+ std::string OrigId = GlobalValue::getGlobalIdentifier(
+ OrigName, GlobalValue::InternalLinkage,
+ TheModule.getSourceFileName());
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ if (GS == DefinedGlobals.end()) {
+ // Also check the original non-promoted non-globalized name. In some
+ // cases a preempted weak value is linked in as a local copy because
+ // it is referenced by an alias (IRLinker::linkGlobalValueProto).
+ // In that case, since it was originally not a local value, it was
+ // recorded in the index using the original name.
+ // FIXME: This may not be needed once PR27866 is fixed.
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ assert(GS != DefinedGlobals.end());
+ }
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
+ };
+
+ // FIXME: See if we can just internalize directly here via linkage changes
+ // based on the index, rather than invoking internalizeModule.
+ llvm::internalizeModule(TheModule, MustPreserveGV);
+}
+
+// Automatically import functions in Module \p DestModule based on the
+// summaries index.
+//
+Expected<bool> FunctionImporter::importFunctions(
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
+ DEBUG(dbgs() << "Starting import for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ unsigned ImportedCount = 0;
+
+ IRMover Mover(DestModule);
+ // Do the actual import of functions now, one Module at a time
+ std::set<StringRef> ModuleNameOrderedList;
+ for (auto &FunctionsToImportPerModule : ImportList) {
+ ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
+ }
+ for (auto &Name : ModuleNameOrderedList) {
+ // Get the module for the import
+ const auto &FunctionsToImportPerModule = ImportList.find(Name);
+ assert(FunctionsToImportPerModule != ImportList.end());
+ Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
+ if (!SrcModuleOrErr)
+ return SrcModuleOrErr.takeError();
+ std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
+ assert(&DestModule.getContext() == &SrcModule->getContext() &&
+ "Context mismatch");
+
+ // If modules were created with lazy metadata loading, materialize it
+ // now, before linking it (otherwise this will be a noop).
+ if (Error Err = SrcModule->materializeMetadata())
+ return std::move(Err);
+
+ auto &ImportGUIDs = FunctionsToImportPerModule->second;
+ // Find the globals to import
+ SetVector<GlobalValue *> GlobalsToImport;
+ for (Function &F : *SrcModule) {
+ if (!F.hasName())
+ continue;
+ auto GUID = F.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function " << GUID
+ << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = F.materialize())
+ return std::move(Err);
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ F.setMetadata(
+ "thinlto_src_module",
+ llvm::MDNode::get(
+ DestModule.getContext(),
+ {llvm::MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(&F);
+ }
+ }
+ for (GlobalVariable &GV : SrcModule->globals()) {
+ if (!GV.hasName())
+ continue;
+ auto GUID = GV.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global " << GUID
+ << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = GV.materialize())
+ return std::move(Err);
+ GlobalsToImport.insert(&GV);
+ }
+ }
+ for (GlobalAlias &GA : SrcModule->aliases()) {
+ // FIXME: This should eventually be controlled entirely by the summary.
+ if (FunctionImportGlobalProcessing::doImportAsDefinition(
+ &GA, &GlobalsToImport)) {
+ GlobalsToImport.insert(&GA);
+ continue;
+ }
+
+ if (!GA.hasName())
+ continue;
+ auto GUID = GA.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias " << GUID
+ << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ // An alias can't point to "available_externally". However, when we import
+ // a linkonce_odr symbol the linkage does not change, so we import the alias
+ // and its aliasee only in this case. This has been handled by
+ // computeImportForFunction().
+ GlobalObject *GO = GA.getBaseObject();
+ assert(GO->hasLinkOnceODRLinkage() &&
+ "Unexpected alias to a non-linkonceODR in import list");
+#ifndef NDEBUG
+ if (!GlobalsToImport.count(GO))
+ DEBUG(dbgs() << " alias triggers importing aliasee " << GO->getGUID()
+ << " " << GO->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+#endif
+ if (Error Err = GO->materialize())
+ return std::move(Err);
+ GlobalsToImport.insert(GO);
+ if (Error Err = GA.materialize())
+ return std::move(Err);
+ GlobalsToImport.insert(&GA);
+ }
+ }
+
+ // Upgrade debug info after we're done materializing all the globals and we
+ // have loaded all the required metadata!
+ UpgradeDebugInfo(*SrcModule);
+
+ // Link in the specified functions.
+ if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport))
+ return true;
+
+ if (PrintImports) {
+ for (const auto *GV : GlobalsToImport)
+ dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
+ << " from " << SrcModule->getSourceFileName() << "\n";
+ }
+
+ if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
+ report_fatal_error("Function Import: link error");
+
+ ImportedCount += GlobalsToImport.size();
+ NumImportedModules++;
+ }
+
+ NumImportedFunctions += ImportedCount;
+
+ DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ return ImportedCount;
+}
+
+/// Summary file to use for function importing when using -function-import from
+/// the command line.
+static cl::opt<std::string>
+ SummaryFile("summary-file",
+ cl::desc("The summary file to use for function importing."));
+
+static bool doImportingForModule(Module &M) {
+ if (SummaryFile.empty())
+ report_fatal_error("error: -function-import requires -summary-file\n");
+ Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
+ getModuleSummaryIndexForFile(SummaryFile);
+ if (!IndexPtrOrErr) {
+ logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
+ "Error loading file '" + SummaryFile + "': ");
+ return false;
+ }
+ std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
+
+ // First step is collecting the import list.
+ FunctionImporter::ImportMapTy ImportList;
+ ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+ ImportList);
+
+ // Conservatively mark all internal values as promoted. This interface is
+ // only used when doing importing via the function importing pass. The pass
+ // is only enabled when testing importing via the 'opt' tool, which does
+ // not do the ThinLink that would normally determine what values to promote.
+ for (auto &I : *Index) {
+ for (auto &S : I.second.SummaryList) {
+ if (GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
+
+ // Next we need to promote to global scope and rename any local values that
+ // are potentially exported to other modules.
+ if (renameModuleForThinLTO(M, *Index, nullptr)) {
+ errs() << "Error renaming module\n";
+ return false;
+ }
+
+ // Perform the import now.
+ auto ModuleLoader = [&M](StringRef Identifier) {
+ return loadFile(Identifier, M.getContext());
+ };
+ FunctionImporter Importer(*Index, ModuleLoader);
+ Expected<bool> Result = Importer.importFunctions(M, ImportList);
+
+ // FIXME: Probably need to propagate Errors through the pass manager.
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(),
+ "Error importing module: ");
+ return false;
+ }
+
+ return *Result;
+}
+
+namespace {
+/// Pass that performs cross-module function import provided a summary file.
+class FunctionImportLegacyPass : public ModulePass {
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Function Importing"; }
+
+ explicit FunctionImportLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return doImportingForModule(M);
+ }
+};
+} // anonymous namespace
+
+PreservedAnalyses FunctionImportPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!doImportingForModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char FunctionImportLegacyPass::ID = 0;
+INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
+ "Summary Based Function Import", false, false)
+
+namespace llvm {
+Pass *createFunctionImportPass() {
+ return new FunctionImportLegacyPass();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
new file mode 100644
index 000000000000..c91e8b454927
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -0,0 +1,294 @@
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program. It uses an aggressive algorithm, searching out globals that are
+// known to be alive. After it finds all of the globals which are needed, it
+// deletes whatever is left over. This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globaldce"
+
+STATISTIC(NumAliases , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumIFuncs, "Number of indirect functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+ class GlobalDCELegacyPass : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ GlobalDCELegacyPass() : ModulePass(ID) {
+ initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the GlobalDCE pass on the specified module, optionally updating
+ // the specified callgraph to reflect the changes.
+ //
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // We need a minimally functional dummy module analysis manager. It needs
+ // to at least know about the possibility of proxying a function analysis
+ // manager.
+ FunctionAnalysisManager DummyFAM;
+ ModuleAnalysisManager DummyMAM;
+ DummyMAM.registerPass(
+ [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
+
+ auto PA = Impl.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ private:
+ GlobalDCEPass Impl;
+ };
+}
+
+char GlobalDCELegacyPass::ID = 0;
+INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
+ "Dead Global Elimination", false, false)
+
+// Public interface to the GlobalDCEPass.
+ModulePass *llvm::createGlobalDCEPass() {
+ return new GlobalDCELegacyPass();
+}
+
+/// Returns true if F contains only a single "ret void" instruction.
+static bool isEmptyFunction(Function *F) {
+ BasicBlock &Entry = F->getEntryBlock();
+ if (Entry.size() != 1 || !isa<ReturnInst>(Entry.front()))
+ return false;
+ ReturnInst &RI = cast<ReturnInst>(Entry.front());
+ return RI.getReturnValue() == nullptr;
+}
+
+/// Compute the set of GlobalValues that depend on V.
+/// The recursion stops as soon as a GlobalValue is encountered.
+void GlobalDCEPass::ComputeDependencies(Value *V,
+ SmallPtrSetImpl<GlobalValue *> &Deps) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *Parent = I->getParent()->getParent();
+ Deps.insert(Parent);
+ } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
+ Deps.insert(GV);
+ } else if (auto *CE = dyn_cast<Constant>(V)) {
+ // Avoid walking the whole tree of a big ConstantExprs multiple times.
+ auto Where = ConstantDependenciesCache.find(CE);
+ if (Where != ConstantDependenciesCache.end()) {
+ auto const &K = Where->second;
+ Deps.insert(K.begin(), K.end());
+ } else {
+ SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
+ for (User *CEUser : CE->users())
+ ComputeDependencies(CEUser, LocalDeps);
+ Deps.insert(LocalDeps.begin(), LocalDeps.end());
+ }
+ }
+}
+
+void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
+ SmallPtrSet<GlobalValue *, 8> Deps;
+ for (User *User : GV.users())
+ ComputeDependencies(User, Deps);
+ Deps.erase(&GV); // Remove self-reference.
+ for (GlobalValue *GVU : Deps) {
+ GVDependencies.insert(std::make_pair(GVU, &GV));
+ }
+}
+
+/// Mark the given GlobalValue as live.
+void GlobalDCEPass::MarkLive(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> *Updates) {
+ auto const Ret = AliveGlobals.insert(&GV);
+ if (!Ret.second)
+ return;
+
+ if (Updates)
+ Updates->push_back(&GV);
+ if (Comdat *C = GV.getComdat()) {
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
+ MarkLive(*CM.second, Updates); // Recursion depth is only two because only
+ // globals in the same comdat are visited.
+ }
+}
+
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ bool Changed = false;
+
+ // The algorithm first computes the set L of global values that are
+ // trivially live. Then it walks the initializers and bodies of these values
+ // to compute the globals they use, which effectively builds a directed
+ // graph where nodes are global values, and an edge from A to B means B is
+ // used by A. Finally, it propagates the liveness information through the
+ // graph starting from the nodes in L. Nodes not marked as alive are
+ // discarded.
+
+ // Remove empty functions from the global ctors list.
+ Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+
+ // Collect the set of members for each comdat.
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+
+ // Loop over the module, adding globals which are obviously necessary.
+ for (GlobalObject &GO : M.global_objects()) {
+ Changed |= RemoveUnusedGlobalValue(GO);
+ // Functions with external linkage are needed if they have a body.
+ // Externally visible and appending globals are needed if they have an
+ // initializer.
+ if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage())
+ if (!GO.isDiscardableIfUnused())
+ MarkLive(GO);
+
+ UpdateGVDependencies(GO);
+ }
+
+ // Compute direct dependencies of aliases.
+ for (GlobalAlias &GA : M.aliases()) {
+ Changed |= RemoveUnusedGlobalValue(GA);
+ // Externally visible aliases are needed.
+ if (!GA.isDiscardableIfUnused())
+ MarkLive(GA);
+
+ UpdateGVDependencies(GA);
+ }
+
+ // Compute direct dependencies of ifuncs.
+ for (GlobalIFunc &GIF : M.ifuncs()) {
+ Changed |= RemoveUnusedGlobalValue(GIF);
+ // Externally visible ifuncs are needed.
+ if (!GIF.isDiscardableIfUnused())
+ MarkLive(GIF);
+
+ UpdateGVDependencies(GIF);
+ }
+
+ // Propagate liveness from collected Global Values through the computed
+ // dependencies.
+ SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
+ AliveGlobals.end()};
+ while (!NewLiveGVs.empty()) {
+ GlobalValue *LGV = NewLiveGVs.pop_back_val();
+ for (auto &&GVD : make_range(GVDependencies.equal_range(LGV)))
+ MarkLive(*GVD.second, &NewLiveGVs);
+ }
+
+ // Now that all globals which are needed are in the AliveGlobals set, we loop
+ // through the program, deleting those which are not alive.
+ //
+
+ // The first pass is to drop initializers of global variables which are dead.
+ std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals
+ for (GlobalVariable &GV : M.globals())
+ if (!AliveGlobals.count(&GV)) {
+ DeadGlobalVars.push_back(&GV); // Keep track of dead globals
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ }
+
+ // The second pass drops the bodies of functions which are dead...
+ std::vector<Function *> DeadFunctions;
+ for (Function &F : M)
+ if (!AliveGlobals.count(&F)) {
+ DeadFunctions.push_back(&F); // Keep track of dead globals
+ if (!F.isDeclaration())
+ F.deleteBody();
+ }
+
+ // The third pass drops targets of aliases which are dead...
+ std::vector<GlobalAlias*> DeadAliases;
+ for (GlobalAlias &GA : M.aliases())
+ if (!AliveGlobals.count(&GA)) {
+ DeadAliases.push_back(&GA);
+ GA.setAliasee(nullptr);
+ }
+
+ // The fourth pass drops targets of ifuncs which are dead...
+ std::vector<GlobalIFunc*> DeadIFuncs;
+ for (GlobalIFunc &GIF : M.ifuncs())
+ if (!AliveGlobals.count(&GIF)) {
+ DeadIFuncs.push_back(&GIF);
+ GIF.setResolver(nullptr);
+ }
+
+ // Now that all references have been dropped, delete the actual objects
+ // themselves.
+ auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
+ RemoveUnusedGlobalValue(*GV);
+ GV->eraseFromParent();
+ Changed = true;
+ };
+
+ NumFunctions += DeadFunctions.size();
+ for (Function *F : DeadFunctions)
+ EraseUnusedGlobalValue(F);
+
+ NumVariables += DeadGlobalVars.size();
+ for (GlobalVariable *GV : DeadGlobalVars)
+ EraseUnusedGlobalValue(GV);
+
+ NumAliases += DeadAliases.size();
+ for (GlobalAlias *GA : DeadAliases)
+ EraseUnusedGlobalValue(GA);
+
+ NumIFuncs += DeadIFuncs.size();
+ for (GlobalIFunc *GIF : DeadIFuncs)
+ EraseUnusedGlobalValue(GIF);
+
+ // Make sure that all memory is released
+ AliveGlobals.clear();
+ ConstantDependenciesCache.clear();
+ GVDependencies.clear();
+ ComdatMembers.clear();
+
+ if (Changed)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it. This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
+ if (GV.use_empty())
+ return false;
+ GV.removeDeadConstantUsers();
+ return GV.use_empty();
+}
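+
+// Illustrative example of what this pass removes (a sketch with hypothetical
+// IR): @live is externally visible and therefore a root; @helper is reachable
+// from it and kept; @dead and @unused_str have no live users and are deleted.
+//
+//   @unused_str = internal constant [4 x i8] c"foo\00"   ; deleted
+//   define internal void @helper() { ret void }          ; kept, used by @live
+//   define internal void @dead() {                       ; deleted
+//     call void @helper()
+//     ret void
+//   }
+//   define void @live() {                                ; kept (root)
+//     call void @helper()
+//     ret void
+//   }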
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
new file mode 100644
index 000000000000..3d57acf06e74
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -0,0 +1,2600 @@
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken. Where it can prove this is safe, it marks read/write globals as
+// constant, deletes variables that are only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalOpt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "globalopt"
+
+STATISTIC(NumMarked , "Number of globals marked constant");
+STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr");
+STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted , "Number of globals deleted");
+STATISTIC(NumGlobUses , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+
+/// Is this global variable possibly used by a leak checker as a root? If so,
+/// we might not really want to eliminate the stores to it.
+static bool isLeakCheckerRoot(GlobalVariable *GV) {
+ // A global variable is a root if it is a pointer, or could plausibly contain
+ // a pointer. There are two challenges; one is that we could have a struct
+ // that has an inner member which is a pointer. We recurse through the type to
+ // detect these (up to a point). The other is that we may actually be a union
+ // of a pointer and another type, and so our LLVM type is an integer which
+ // gets converted into a pointer, or our type is an [i8 x #] with a pointer
+ // potentially contained here.
+
+ if (GV->hasPrivateLinkage())
+ return false;
+
+ SmallVector<Type *, 4> Types;
+ Types.push_back(GV->getValueType());
+
+ unsigned Limit = 20;
+ do {
+ Type *Ty = Types.pop_back_val();
+ switch (Ty->getTypeID()) {
+ default: break;
+ case Type::PointerTyID: return true;
+ case Type::ArrayTyID:
+ case Type::VectorTyID: {
+ SequentialType *STy = cast<SequentialType>(Ty);
+ Types.push_back(STy->getElementType());
+ break;
+ }
+ case Type::StructTyID: {
+ StructType *STy = cast<StructType>(Ty);
+ if (STy->isOpaque()) return true;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Type *InnerTy = *I;
+ if (isa<PointerType>(InnerTy)) return true;
+ if (isa<CompositeType>(InnerTy))
+ Types.push_back(InnerTy);
+ }
+ break;
+ }
+ }
+ if (--Limit == 0) return true;
+ } while (!Types.empty());
+ return false;
+}
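+
+// For example (illustrative only): a global of type { i32, i8* } is treated as
+// a possible leak-checker root because the struct contains a pointer member,
+// while a global of type [16 x i32] is not.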
+
+/// Given a value that is stored to a global but never read, determine whether
+/// it's safe to remove the store and the chain of computation that feeds the
+/// store.
+static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {
+ do {
+ if (isa<Constant>(V))
+ return true;
+ if (!V->hasOneUse())
+ return false;
+ if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
+ isa<GlobalValue>(V))
+ return false;
+ if (isAllocationFn(V, TLI))
+ return true;
+
+ Instruction *I = cast<Instruction>(V);
+ if (I->mayHaveSideEffects())
+ return false;
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ } else if (I->getNumOperands() != 1) {
+ return false;
+ }
+
+ V = I->getOperand(0);
+ } while (1);
+}
+
+/// This GV is a pointer root. Loop over all users of the global and clean up
+/// any that obviously don't assign the global a dynamically allocated value.
+static bool CleanupPointerRootUsers(GlobalVariable *GV,
+ const TargetLibraryInfo *TLI) {
+ // A brief explanation of leak checkers. The goal is to find bugs where
+ // pointers are forgotten, causing an accumulating growth in memory
+ // usage over time. The common strategy for leak checkers is to whitelist the
+ // memory pointed to by globals at exit. This is popular because it also
+ // solves another problem where the main thread of a C++ program may shut down
+ // before other threads that are still expecting to use those globals. To
+ // handle that case, we expect the program may create a singleton and never
+ // destroy it.
+
+ bool Changed = false;
+
+ // If Dead[n].first is the only use of a malloc result, we can delete its
+ // chain of computation and the store to the global in Dead[n].second.
+ SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
+
+ // Constants can't be pointers to dynamically allocated memory.
+ for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
+ UI != E;) {
+ User *U = *UI++;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Value *V = SI->getValueOperand();
+ if (isa<Constant>(V)) {
+ Changed = true;
+ SI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, SI));
+ }
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
+ if (isa<Constant>(MSI->getValue())) {
+ Changed = true;
+ MSI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MSI));
+ }
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
+ GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
+ if (MemSrc && MemSrc->isConstant()) {
+ Changed = true;
+ MTI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MTI));
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ // This could have invalidated UI, start over from scratch.
+ Dead.clear();
+ CleanupPointerRootUsers(GV, TLI);
+ return true;
+ }
+ }
+ }
+
+ for (int i = 0, e = Dead.size(); i != e; ++i) {
+ if (IsSafeComputationToRemove(Dead[i].first, TLI)) {
+ Dead[i].second->eraseFromParent();
+ Instruction *I = Dead[i].first;
+ do {
+ if (isAllocationFn(I, TLI))
+ break;
+ Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
+ if (!J)
+ break;
+ I->eraseFromParent();
+ I = J;
+ } while (1);
+ I->eraseFromParent();
+ }
+ }
+
+ return Changed;
+}
+
+/// We just marked GV constant. Loop over all users of the global, cleaning up
+/// the obvious ones. This is largely just a quick scan over the use list to
+/// clean up the easy and obvious cruft. This returns true if it made a change.
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ bool Changed = false;
+ // Note that we need to use a weak value handle for the worklist items. When
+ // we delete a constant array, we may also be holding a pointer to one of its
+ // elements (or an element of one of its elements if we're dealing with an
+ // array of arrays) in the worklist.
+ SmallVector<WeakTrackingVH, 8> WorkList(V->user_begin(), V->user_end());
+ while (!WorkList.empty()) {
+ Value *UV = WorkList.pop_back_val();
+ if (!UV)
+ continue;
+
+ User *U = cast<User>(UV);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (Init) {
+ // Replace the load with the initializer.
+ LI->replaceAllUsesWith(Init);
+ LI->eraseFromParent();
+ Changed = true;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // Store must be unreachable or storing Init into the global.
+ SI->eraseFromParent();
+ Changed = true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ Constant *SubInit = nullptr;
+ if (Init)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+ Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI);
+ } else if ((CE->getOpcode() == Instruction::BitCast &&
+ CE->getType()->isPointerTy()) ||
+ CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Pointer cast, delete any stores and memsets to the global.
+ Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI);
+ }
+
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+ // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+ // and will invalidate our notion of what Init is.
+ Constant *SubInit = nullptr;
+ if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+ ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
+ ConstantFoldInstruction(GEP, DL, TLI));
+ if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+ // If the initializer is an all-null value and we have an inbounds GEP,
+ // we already know what the result of any load from that GEP is.
+ // TODO: Handle splats.
+ if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+ SubInit = Constant::getNullValue(GEP->getResultElementType());
+ }
+ Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI);
+
+ if (GEP->use_empty()) {
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+ if (MI->getRawDest() == V) {
+ MI->eraseFromParent();
+ Changed = true;
+ }
+
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ // If we have a chain of dead constantexprs or other things dangling from
+ // us, and if they are all dead, nuke them without remorse.
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ CleanupConstantGlobalUsers(V, Init, DL, TLI);
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+/// Return true if the specified instruction is a safe user of a derived
+/// expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+ // We might have a dead and dangling constant hanging off of here.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return isSafeToDestroyConstant(C);
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Loads are ok.
+ if (isa<LoadInst>(I)) return true;
+
+ // Stores *to* the pointer are ok.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getOperand(0) != V;
+
+ // Otherwise, it must be a GEP.
+ GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
+ if (!GEPI) return false;
+
+ if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
+ !cast<Constant>(GEPI->getOperand(1))->isNullValue())
+ return false;
+
+ for (User *U : GEPI->users())
+ if (!isSafeSROAElementUse(U))
+ return false;
+ return true;
+}
+
+
+/// U is a direct user of the specified global value. Look at it and its uses
+/// and decide whether it is safe to SROA this global.
+static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
+ // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+ if (!isa<GetElementPtrInst>(U) &&
+ (!isa<ConstantExpr>(U) ||
+ cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+ return false;
+
+ // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
+ // don't like < 3 operand CE's, and we don't like non-constant integer
+ // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
+ // value of C.
+ if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+ !cast<Constant>(U->getOperand(1))->isNullValue() ||
+ !isa<ConstantInt>(U->getOperand(2)))
+ return false;
+
+ gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+ ++GEPI; // Skip over the pointer index.
+
+ // If this is a use of an array allocation, do a bit more checking for sanity.
+ if (GEPI.isSequential()) {
+ ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
+
+ // Check to make sure that index falls within the array. If not,
+ // something funny is going on, so we won't do the optimization.
+ //
+ if (GEPI.isBoundedSequential() &&
+ Idx->getZExtValue() >= GEPI.getSequentialNumElements())
+ return false;
+
+ // We cannot scalar repl this level of the array unless all array
+ // sub-indices are in-range constants. In particular, consider:
+ // A[0][i]. We cannot know that the user isn't doing invalid things like
+ // allowing i to index an out-of-range subscript that accesses A[1].
+ //
+ // Scalar replacing *just* the outer index of the array is probably not
+ // going to be a win anyway, so just give up.
+ for (++GEPI; // Skip array index.
+ GEPI != E;
+ ++GEPI) {
+ if (GEPI.isStruct())
+ continue;
+
+ ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+ if (!IdxVal ||
+ (GEPI.isBoundedSequential() &&
+ IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
+ return false;
+ }
+ }
+
+ for (User *UU : U->users())
+ if (!isSafeSROAElementUse(UU))
+ return false;
+
+ return true;
+}
+
+/// Look at all uses of the global and decide whether it is safe for us to
+/// perform this transformation.
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+ for (User *U : GV->users())
+ if (!IsUserOfGlobalSafeForSRA(U, GV))
+ return false;
+
+ return true;
+}
+
+
+/// Perform scalar replacement of aggregates on the specified global variable.
+/// This opens the door for other optimizations by exposing the behavior of the
+/// program in a more fine-grained way. We have determined that this
+/// transformation is safe already. We return the first global variable we
+/// insert so that the caller can reprocess it.
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
+ // Make sure this global only has simple uses that we can SRA.
+ if (!GlobalUsersSafeToSRA(GV))
+ return nullptr;
+
+ assert(GV->hasLocalLinkage());
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+
+ std::vector<GlobalVariable*> NewGlobals;
+ Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+
+ // Get the alignment of the global, either explicit or target-specific.
+ unsigned StartAlignment = GV->getAlignment();
+ if (StartAlignment == 0)
+ StartAlignment = DL.getABITypeAlignment(GV->getType());
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ NewGlobals.reserve(STy->getNumElements());
+ const StructLayout &Layout = *DL.getStructLayout(STy);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Constant *In = Init->getAggregateElement(i);
+ assert(In && "Couldn't get element of initializer?");
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
+ GlobalVariable::InternalLinkage,
+ In, GV->getName()+"."+Twine(i),
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
+ Globals.push_back(NGV);
+ NewGlobals.push_back(NGV);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ uint64_t FieldOffset = Layout.getElementOffset(i);
+ unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+ if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i)))
+ NGV->setAlignment(NewAlign);
+ }
+ } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+ unsigned NumElements = STy->getNumElements();
+ if (NumElements > 16 && GV->hasNUsesOrMore(16))
+ return nullptr; // It's not worth it.
+ NewGlobals.reserve(NumElements);
+
+ uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType());
+ unsigned EltAlign = DL.getABITypeAlignment(STy->getElementType());
+ for (unsigned i = 0, e = NumElements; i != e; ++i) {
+ Constant *In = Init->getAggregateElement(i);
+ assert(In && "Couldn't get element of initializer?");
+
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
+ GlobalVariable::InternalLinkage,
+ In, GV->getName()+"."+Twine(i),
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
+ Globals.push_back(NGV);
+ NewGlobals.push_back(NGV);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+ if (NewAlign > EltAlign)
+ NGV->setAlignment(NewAlign);
+ }
+ }
+
+ if (NewGlobals.empty())
+ return nullptr;
+
+ DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
+
+ Constant *NullInt = Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+
+ // Loop over all of the uses of the global, replacing the constantexpr geps,
+ // with smaller constantexpr geps or direct references.
+ while (!GV->use_empty()) {
+ User *GEP = GV->user_back();
+ assert(((isa<ConstantExpr>(GEP) &&
+ cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+ isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+ // Ignore operand 1, which has to be zero or else the program is quite
+ // broken (undefined). Get operand 2, which is the structure or array
+ // index.
+ unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access.
+
+ Value *NewPtr = NewGlobals[Val];
+ Type *NewTy = NewGlobals[Val]->getValueType();
+
+ // Form a shorter GEP if needed.
+ if (GEP->getNumOperands() > 3) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+ Idxs.push_back(CE->getOperand(i));
+ NewPtr =
+ ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
+ } else {
+ GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+ SmallVector<Value*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+ Idxs.push_back(GEPI->getOperand(i));
+ NewPtr = GetElementPtrInst::Create(
+ NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI);
+ }
+ }
+ GEP->replaceAllUsesWith(NewPtr);
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+ GEPI->eraseFromParent();
+ else
+ cast<ConstantExpr>(GEP)->destroyConstant();
+ }
+
+ // Delete the old global, now that it is dead.
+ Globals.erase(GV);
+ ++NumSRA;
+
+ // Loop over the new globals array deleting any globals that are obviously
+ // dead. This can arise due to scalarization of a structure or an array that
+ // has elements that are dead.
+ unsigned FirstGlobal = 0;
+ for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i)
+ if (NewGlobals[i]->use_empty()) {
+ Globals.erase(NewGlobals[i]);
+ if (FirstGlobal == i) ++FirstGlobal;
+ }
+
+ return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr;
+}
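+
+// Illustrative transformation (a sketch with hypothetical IR):
+//
+//   @g = internal global { i32, i32 } { i32 1, i32 2 }
+//   %x = load i32, i32* getelementptr ({ i32, i32 }, { i32, i32 }* @g,
+//                                      i32 0, i32 1)
+//
+// becomes two scalar globals that later passes can optimize independently:
+//
+//   @g.0 = internal global i32 1
+//   @g.1 = internal global i32 2
+//   %x = load i32, i32* @g.1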
+
+/// Return true if all users of the specified value will trap if the value is
+/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid
+/// reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users())
+ if (isa<LoadInst>(U)) {
+ // Will trap.
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Storing the value.
+ }
+ } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
+ if (CI->getCalledValue() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
+ if (II->getCalledValue() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If we've already seen this phi node, ignore it, it has already been
+ // checked.
+ if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
+ return false;
+ } else if (isa<ICmpInst>(U) &&
+ isa<ConstantPointerNull>(U->getOperand(1))) {
+ // Ignore icmp X, null
+ } else {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false;
+ }
+
+ return true;
+}
+
+/// Return true if all uses of any loads from GV will trap if the loaded value
+/// is null. Note that this also permits comparisons of the loaded value
+/// against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+ return false;
+ } else if (isa<StoreInst>(U)) {
+ // Ignore stores to the global.
+ } else {
+ // We don't know or understand this user, bail out.
+ //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
+ return false;
+ }
+ return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+ bool Changed = false;
+ for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
+ Instruction *I = cast<Instruction>(*UI++);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setOperand(0, NewV);
+ Changed = true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) == V) {
+ SI->setOperand(1, NewV);
+ Changed = true;
+ }
+ } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ CallSite CS(I);
+ if (CS.getCalledValue() == V) {
+ // Calling through the pointer! Turn into a direct call, but be careful
+ // that the pointer is not also being passed as an argument.
+ CS.setCalledFunction(NewV);
+ Changed = true;
+ bool PassedAsArg = false;
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.getArgument(i) == V) {
+ PassedAsArg = true;
+ CS.setArgument(i, NewV);
+ }
+
+ if (PassedAsArg) {
+ // Being passed as an argument also. Be careful to not invalidate UI!
+ UI = V->user_begin();
+ }
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+ ConstantExpr::getCast(CI->getOpcode(),
+ NewV, CI->getType()));
+ if (CI->use_empty()) {
+ Changed = true;
+ CI->eraseFromParent();
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ // Handle GEPs here: if every index is a constant, fold the GEP into a
+ // constant GEP of NewV and recurse on it.
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.reserve(GEPI->getNumOperands()-1);
+ for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+ i != e; ++i)
+ if (Constant *C = dyn_cast<Constant>(*i))
+ Idxs.push_back(C);
+ else
+ break;
+ if (Idxs.size() == GEPI->getNumOperands()-1)
+ Changed |= OptimizeAwayTrappingUsesOfValue(
+ GEPI, ConstantExpr::getGetElementPtr(nullptr, NewV, Idxs));
+ if (GEPI->use_empty()) {
+ Changed = true;
+ GEPI->eraseFromParent();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+
+/// The specified global has only one non-null value stored into it. If there
+/// are uses of the loaded value that would trap if the loaded value is
+/// dynamically null, then we know those uses can never see a null value, so
+/// we can forward the stored value to them and optimize away the load.
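+/// Illustrative sketch (hypothetical IR, not from the original source): with
+///   @g = internal global void ()* null     ; the only non-null store is @f
+/// a call made through "%fp = load void ()*, void ()** @g" is redirected to
+/// call @f directly, and the now-dead load is erased.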
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ bool Changed = false;
+
+ // Keep track of whether we are able to remove all the uses of the global
+ // other than the store that defines it.
+ bool AllNonStoreUsesGone = true;
+
+ // Replace all uses of loads with uses of the stored value.
+ for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
+ User *GlobalUser = *GUI++;
+ if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+ // If we were able to delete all uses of the load, the load itself is dead.
+ if (LI->use_empty()) {
+ LI->eraseFromParent();
+ Changed = true;
+ } else {
+ AllNonStoreUsesGone = false;
+ }
+ } else if (isa<StoreInst>(GlobalUser)) {
+ // Ignore the store that stores "LV" to the global.
+ assert(GlobalUser->getOperand(1) == GV &&
+ "Must be storing *to* the global");
+ } else {
+ AllNonStoreUsesGone = false;
+
+ // If we get here we could have other crazy uses that are transitively
+ // loaded.
+ assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+ isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
+ isa<BitCastInst>(GlobalUser) ||
+ isa<GetElementPtrInst>(GlobalUser)) &&
+ "Only expect load and stores!");
+ }
+ }
+
+ if (Changed) {
+ DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n");
+ ++NumGlobUses;
+ }
+
+ // If we nuked all of the loads, then none of the stores are needed either,
+ // nor is the global.
+ if (AllNonStoreUsesGone) {
+ if (isLeakCheckerRoot(GV)) {
+ Changed |= CleanupPointerRootUsers(GV, TLI);
+ } else {
+ Changed = true;
+ CleanupConstantGlobalUsers(GV, nullptr, DL, TLI);
+ }
+ if (GV->use_empty()) {
+ DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ Changed = true;
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ }
+ return Changed;
+}
+
+/// Walk the use list of V, constant folding all of the instructions that are
+/// foldable.
+static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
+ if (Instruction *I = dyn_cast<Instruction>(*UI++))
+ if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
+ I->replaceAllUsesWith(NewC);
+
+ // Advance UI to the next non-I use to avoid invalidating it!
+ // Instructions could multiply use V.
+ while (UI != E && *UI == I)
+ ++UI;
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ }
+}
+
+/// This function takes the specified global variable, and transforms the
+/// program as if it always contained the result of the specified malloc.
+/// Because it is always the result of the specified malloc, there is no reason
+/// to actually DO the malloc. Instead, turn the malloc into a global, and
+/// rewrite any loads of GV as uses of the new global.
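+/// Illustrative sketch (hypothetical names, not from the original source):
+///   @g = internal global i32* null          ; only ever assigned malloc(4)
+/// becomes
+///   @g.body = internal global i32 undef
+///   @g.init = internal global i1 false      ; only kept if @g is compared
+///                                           ; against null somewhere
+/// and loads of @g are rewritten to refer to @g.body directly.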
+static GlobalVariable *
+OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
+ ConstantInt *NElements, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
+
+ Type *GlobalType;
+ if (NElements->getZExtValue() == 1)
+ GlobalType = AllocTy;
+ else
+ // If we have an array allocation, the global variable is of an array.
+ GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+
+ // Create the new global variable. The contents of the malloc'd memory are
+ // undefined, so initialize it with an undef value.
+ GlobalVariable *NewGV = new GlobalVariable(
+ *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
+ GV->getThreadLocalMode());
+
+ // If there are bitcast users of the malloc (which is typical, usually we have
+ // a malloc + bitcast) then replace them with uses of the new global. Update
+ // other users to use the global as well.
+ BitCastInst *TheBC = nullptr;
+ while (!CI->use_empty()) {
+ Instruction *User = cast<Instruction>(CI->user_back());
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+ if (BCI->getType() == NewGV->getType()) {
+ BCI->replaceAllUsesWith(NewGV);
+ BCI->eraseFromParent();
+ } else {
+ BCI->setOperand(0, NewGV);
+ }
+ } else {
+ if (!TheBC)
+ TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
+ User->replaceUsesOfWith(CI, TheBC);
+ }
+ }
+
+ Constant *RepValue = NewGV;
+ if (NewGV->getType() != GV->getValueType())
+ RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
+
+ // If there is a comparison against null, we will insert a global bool to
+ // keep track of whether the global was initialized yet or not.
+ GlobalVariable *InitBool =
+ new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".init", GV->getThreadLocalMode());
+ bool InitBoolUsed = false;
+
+ // Loop over all uses of GV, processing them in turn.
+ while (!GV->use_empty()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
+ // The global is initialized when the store to it occurs.
+ new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0,
+ SI->getOrdering(), SI->getSyncScopeID(), SI);
+ SI->eraseFromParent();
+ continue;
+ }
+
+ LoadInst *LI = cast<LoadInst>(GV->user_back());
+ while (!LI->use_empty()) {
+ Use &LoadUse = *LI->use_begin();
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
+ if (!ICI) {
+ LoadUse = RepValue;
+ continue;
+ }
+
+ // Replace the cmp X, 0 with a use of the bool value.
+ // Sink the load to where the compare was, if atomic rules allow us to.
+ Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0,
+ LI->getOrdering(), LI->getSyncScopeID(),
+ LI->isUnordered() ? (Instruction*)ICI : LI);
+ InitBoolUsed = true;
+ switch (ICI->getPredicate()) {
+ default: llvm_unreachable("Unknown ICmp Predicate!");
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: // X < null -> always false
+ LV = ConstantInt::getFalse(GV->getContext());
+ break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_EQ:
+ LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
+ break;
+ case ICmpInst::ICMP_NE:
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ break; // no change.
+ }
+ ICI->replaceAllUsesWith(LV);
+ ICI->eraseFromParent();
+ }
+ LI->eraseFromParent();
+ }
+
+ // If the initialization boolean was used, insert it, otherwise delete it.
+ if (!InitBoolUsed) {
+ while (!InitBool->use_empty()) // Delete initializations
+ cast<StoreInst>(InitBool->user_back())->eraseFromParent();
+ delete InitBool;
+ } else
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
+
+ // Now that GV is dead, nuke it and the malloc.
+ GV->eraseFromParent();
+ CI->eraseFromParent();
+
+ // To enable further optimizations, loop over all users of NewGV and try to
+ // constant-propagate them. This will promote GEP instructions with constant
+ // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+ ConstantPropUsersOf(NewGV, DL, TLI);
+ if (RepValue != NewGV)
+ ConstantPropUsersOf(RepValue, DL, TLI);
+
+ return NewGV;
+}
+
+/// Scan the use-list of V checking to make sure that there are no complex uses
+/// of V. We permit simple things like dereferencing the pointer, but not
+/// storing through the address, unless it is to the specified global.
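+/// Illustrative sketch (not part of the original source): a malloc result that
+/// is only loaded from, GEP'd into, compared, or stored into GV passes this
+/// check; storing the pointer into any other global, or passing it to a call,
+/// causes a false return.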
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+ const GlobalVariable *GV,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users()) {
+ const Instruction *Inst = cast<Instruction>(U);
+
+ if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+ continue; // Fine, ignore.
+ }
+
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+ return false; // Storing the pointer itself... bad.
+ continue; // Otherwise, storing through it, or storing into GV... fine.
+ }
+
+ // Must index into the array and into the struct.
+ if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
+ // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
+ // cycles.
+ if (PHIs.insert(PN).second)
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ return false;
+ }
+ return true;
+}
+
+/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
+/// allocation into loads from the global and uses of the resultant pointer.
+/// Further, delete the store into GV. This assumes that these values pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
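+/// Illustrative sketch (hypothetical IR, not from the original source): after
+/// this runs, the initializing "store i8* %mem, i8** @g" is gone and every
+/// remaining use of %mem instead reads the pointer back out of the global:
+///   %mem.val = load i8*, i8** @g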
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+ GlobalVariable *GV) {
+ while (!Alloc->use_empty()) {
+ Instruction *U = cast<Instruction>(*Alloc->user_begin());
+ Instruction *InsertPt = U;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If this is the store of the allocation into the global, remove it.
+ if (SI->getOperand(1) == GV) {
+ SI->eraseFromParent();
+ continue;
+ }
+ } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // Insert the load in the corresponding predecessor, not right before the
+ // PHI.
+ InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
+ } else if (isa<BitCastInst>(U)) {
+ // Must be bitcast between the malloc and store to initialize the global.
+ ReplaceUsesOfMallocWithGlobal(U, GV);
+ U->eraseFromParent();
+ continue;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ // If this is a "GEP bitcast" and the user is a store to the global, then
+ // just process it as a bitcast.
+ if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+ if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
+ if (SI->getOperand(1) == GV) {
+ // Must be bitcast GEP between the malloc and store to initialize
+ // the global.
+ ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+ GEPI->eraseFromParent();
+ continue;
+ }
+ }
+
+ // Insert a load from the global, and use it instead of the malloc.
+ Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt);
+ U->replaceUsesOfWith(Alloc, NL);
+ }
+}
+
+/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
+/// perform heap SRA on. This permits GEP's that index through the array and
+/// struct field, icmps of null, and PHIs.
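+/// Illustrative sketch (hypothetical IR, not from the original source): for a
+/// load %p of the global, uses such as
+///   %f = getelementptr %struct.T, %struct.T* %p, i64 %idx, i32 1
+///   %c = icmp eq %struct.T* %p, null
+/// are accepted, while anything else (e.g. passing %p to a call) is rejected.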
+static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
+ // We permit users of the load of the following forms: an icmp comparing
+ // against the null pointer, a getelementptr of a specific form, and PHI
+ // nodes whose uses are themselves simple enough.
+ for (const User *U : V->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+
+ // Comparison against null is ok.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return false;
+ continue;
+ }
+
+ // getelementptr is also ok, but only a simple form.
+ if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
+ // Must index into the array and into the struct.
+ if (GEPI->getNumOperands() < 3)
+ return false;
+
+ // Otherwise the GEP is ok.
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ if (!LoadUsingPHIsPerLoad.insert(PN).second)
+ // This means some phi nodes are dependent on each other.
+ // Avoid infinite looping!
+ return false;
+ if (!LoadUsingPHIs.insert(PN).second)
+ // If we have already analyzed this PHI, then it is safe.
+ continue;
+
+ // Make sure all uses of the PHI are simple enough to transform.
+ if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+ LoadUsingPHIs, LoadUsingPHIsPerLoad))
+ return false;
+
+ continue;
+ }
+
+ // Otherwise we don't know what this is, not ok.
+ return false;
+ }
+
+ return true;
+}
+
+
+/// If all users of values loaded from GV are simple enough to perform HeapSRA,
+/// return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
+ Instruction *StoredVal) {
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+ LoadUsingPHIsPerLoad))
+ return false;
+ LoadUsingPHIsPerLoad.clear();
+ }
+
+ // If we reach here, we know that all uses of the loads and transitive uses
+ // (through PHI nodes) are simple enough to transform. However, we don't know
+ // that all inputs to the PHI nodes are in the same equivalence sets.
+ // Check to verify that all operands of the PHIs are either PHIs that can be
+ // transformed, loads from GV, or the stored value (StoredVal) itself.
+ for (const PHINode *PN : LoadUsingPHIs) {
+ for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+ Value *InVal = PN->getIncomingValue(op);
+
+ // PHI of the stored value itself is ok.
+ if (InVal == StoredVal) continue;
+
+ if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+ // One of the PHIs in our set is (optimistically) ok.
+ if (LoadUsingPHIs.count(InPN))
+ continue;
+ return false;
+ }
+
+ // Load from GV is ok.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
+ if (LI->getOperand(0) == GV)
+ continue;
+
+ // UNDEF? NULL?
+
+ // Anything else is rejected.
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+
+ if (FieldNo >= FieldVals.size())
+ FieldVals.resize(FieldNo+1);
+
+ // If we already have this value, just reuse the previously scalarized
+ // version.
+ if (Value *FieldVal = FieldVals[FieldNo])
+ return FieldVal;
+
+ // Depending on what instruction this is, we have several cases.
+ Value *Result;
+ if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+ // This is a scalarized version of the load from the global. Just create
+ // a new Load of the scalarized global.
+ Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
+ InsertedScalarizedValues,
+ PHIsToRewrite),
+ LI->getName()+".f"+Twine(FieldNo), LI);
+ } else {
+ PHINode *PN = cast<PHINode>(V);
+ // PN's type is pointer to struct. Make a new PHI of pointer to struct
+ // field.
+
+ PointerType *PTy = cast<PointerType>(PN->getType());
+ StructType *ST = cast<StructType>(PTy->getElementType());
+
+ unsigned AS = PTy->getAddressSpace();
+ PHINode *NewPN =
+ PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
+ PN->getNumIncomingValues(),
+ PN->getName()+".f"+Twine(FieldNo), PN);
+ Result = NewPN;
+ PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+ }
+
+ return FieldVals[FieldNo] = Result;
+}
+
+/// Given a load instruction and a value derived from the load, rewrite the
+/// derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ // If this is a comparison against null, handle it.
+ if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+ assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+ // If we have a setcc of the loaded pointer, we can use a setcc of any
+ // field.
+ Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
+ Constant::getNullValue(NPtr->getType()),
+ SCI->getName());
+ SCI->replaceAllUsesWith(New);
+ SCI->eraseFromParent();
+ return;
+ }
+
+ // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+ assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+ && "Unexpected GEPI!");
+
+ // Load the pointer for this field.
+ unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+ Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ // Create the new GEP idx vector.
+ SmallVector<Value*, 8> GEPIdx;
+ GEPIdx.push_back(GEPI->getOperand(1));
+ GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+ Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NGEPI);
+ GEPI->eraseFromParent();
+ return;
+ }
+
+ // Recursively transform the users of PHI nodes. This will lazily create the
+ // PHIs that are needed for individual elements. Keep track of what PHIs we
+ // see in InsertedScalarizedValues so that we don't get infinite loops (very
+ // antisocial). If the PHI is already in InsertedScalarizedValues, it has
+ // already been seen first by another load, so its uses have already been
+ // processed.
+ PHINode *PN = cast<PHINode>(LoadUser);
+ if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+ std::vector<Value*>())).second)
+ return;
+
+ // If this is the first time we've seen this PHI, recursively process all
+ // users.
+ for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+}
+
+/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
+/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
+/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+ DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+ for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+
+ if (Load->use_empty()) {
+ Load->eraseFromParent();
+ InsertedScalarizedValues.erase(Load);
+ }
+}
+
+/// CI is an allocation of an array of structures. Break it up into multiple
+/// allocations of arrays of the fields.
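+/// Illustrative sketch (hypothetical names, not from the original source): a
+/// malloc of an array of { i32*, i32 } stored into @g is replaced by two
+/// field globals @g.f0 and @g.f1, each initialized from its own malloc of a
+/// plain array of that field's type, together with the null-check/free code
+/// described below for the case where one of the per-field mallocs fails.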
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
+ Value *NElems, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
+ Type *MAT = getMallocAllocatedType(CI, TLI);
+ StructType *STy = cast<StructType>(MAT);
+
+ // There is guaranteed to be at least one use of the malloc (storing
+ // it into GV). If there are other uses, change them to be uses of
+ // the global to simplify later code. This also deletes the store
+ // into GV.
+ ReplaceUsesOfMallocWithGlobal(CI, GV);
+
+ // Okay, at this point, there are no users of the malloc. Insert N
+ // new mallocs at the same place as CI, and N globals.
+ std::vector<Value*> FieldGlobals;
+ std::vector<Value*> FieldMallocs;
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ unsigned AS = GV->getType()->getPointerAddressSpace();
+ for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+ Type *FieldTy = STy->getElementType(FieldNo);
+ PointerType *PFieldTy = PointerType::get(FieldTy, AS);
+
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
+ nullptr, GV->getThreadLocalMode());
+ NGV->copyAttributesFrom(GV);
+ FieldGlobals.push_back(NGV);
+
+ unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
+ if (StructType *ST = dyn_cast<StructType>(FieldTy))
+ TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
+ ConstantInt::get(IntPtrTy, TypeSize),
+ NElems, OpBundles, nullptr,
+ CI->getName() + ".f" + Twine(FieldNo));
+ FieldMallocs.push_back(NMI);
+ new StoreInst(NMI, NGV, CI);
+ }
+
+ // The tricky aspect of this transformation is handling the case when malloc
+ // fails. In the original code, malloc failing would set the result pointer
+ // of malloc to null. In this case, some mallocs could succeed and others
+ // could fail. As such, we emit code that looks like this:
+ // F0 = malloc(field0)
+ // F1 = malloc(field1)
+ // F2 = malloc(field2)
+ // if (F0 == 0 || F1 == 0 || F2 == 0) {
+ // if (F0) { free(F0); F0 = 0; }
+ // if (F1) { free(F1); F1 = 0; }
+ // if (F2) { free(F2); F2 = 0; }
+ // }
+ // The malloc can also fail if its argument is too large.
+ Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
+ Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
+ ConstantZero, "isneg");
+ for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+ Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
+ Constant::getNullValue(FieldMallocs[i]->getType()),
+ "isnull");
+ RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+ }
+
+ // Split the basic block at the old malloc.
+ BasicBlock *OrigBB = CI->getParent();
+ BasicBlock *ContBB =
+ OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
+
+ // Create the block to check the first condition. Put all these blocks at the
+ // end of the function as they are unlikely to be executed.
+ BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+ "malloc_ret_null",
+ OrigBB->getParent());
+
+ // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+ // branch on RunningOr.
+ OrigBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+ // Within the NullPtrBlock, we need to emit a comparison and branch for each
+ // pointer, because some may be null while others are not.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
+ Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
+ Constant::getNullValue(GVVal->getType()));
+ BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
+ OrigBB->getParent());
+ BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
+ OrigBB->getParent());
+ Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
+ Cmp, NullPtrBlock);
+
+ // Fill in FreeBlock.
+ CallInst::CreateFree(GVVal, OpBundles, BI);
+ new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+ FreeBlock);
+ BranchInst::Create(NextBlock, FreeBlock);
+
+ NullPtrBlock = NextBlock;
+ }
+
+ BranchInst::Create(ContBB, NullPtrBlock);
+
+ // CI is no longer needed, remove it.
+ CI->eraseFromParent();
+
+ /// As we process loads, if we can't immediately update all uses of the load,
+ /// keep track of what scalarized loads are inserted for a given load.
+ DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+ InsertedScalarizedValues[GV] = FieldGlobals;
+
+ std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+
+ // Okay, the malloc site is completely handled. All of the uses of GV are now
+ // loads, and all uses of those loads are simple. Rewrite them to use loads
+ // of the per-field globals instead.
+ for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+ continue;
+ }
+
+ // Must be a store of null.
+ StoreInst *SI = cast<StoreInst>(User);
+ assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+ "Unexpected heap-sra user!");
+
+ // Insert a store of null into each global.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
+ Constant *Null = Constant::getNullValue(ValTy);
+ new StoreInst(Null, FieldGlobals[i], SI);
+ }
+ // Erase the original store.
+ SI->eraseFromParent();
+ }
+
+ // While we have PHIs that are interesting to rewrite, do it.
+ while (!PHIsToRewrite.empty()) {
+ PHINode *PN = PHIsToRewrite.back().first;
+ unsigned FieldNo = PHIsToRewrite.back().second;
+ PHIsToRewrite.pop_back();
+ PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+ assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+ // Add all the incoming values. This can materialize more phis.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+ PHIsToRewrite);
+ FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+ }
+ }
+
+ // Drop all inter-phi links and any loads that made it this far.
+ for (DenseMap<Value*, std::vector<Value*> >::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->dropAllReferences();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->dropAllReferences();
+ }
+
+ // Delete all the phis and loads now that inter-references are dead.
+ for (DenseMap<Value*, std::vector<Value*> >::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->eraseFromParent();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->eraseFromParent();
+ }
+
+ // The old global is now dead, remove it.
+ GV->eraseFromParent();
+
+ ++NumHeapSRA;
+ return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// This function is called when we see a pointer global variable with a single
+/// value stored into it that is a malloc or a cast of a malloc.
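+/// Illustrative sketch (not part of the original source): a small fixed-size
+/// allocation stored into the global is turned into a plain global via
+/// OptimizeGlobalAddressOfMalloc, while a malloc of an array of structs whose
+/// loads are simple enough is instead split per field via PerformHeapAllocSRoA.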
+static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
+ Type *AllocTy,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // If this is a malloc of an abstract type, don't touch it.
+ if (!AllocTy->isSized())
+ return false;
+
+ // We can't optimize this global unless all uses of it are *known* to be
+ // of the malloc value, not of the null initializer value (consider a use
+ // that compares the global's value against zero to see if the malloc has
+ // been reached). To do this, we check to see if all uses of the global
+ // would trap if the global were null: this proves that they must all
+ // happen after the malloc.
+ if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+ return false;
+
+ // We can't optimize this if the malloc itself is used in a complex way,
+ // for example, being stored into multiple globals. This allows the
+ // malloc to be stored into the specified global, loaded, icmp'd, and
+ // GEP'd. These are all uses we know how to rewrite in terms of the
+ // global.
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
+ return false;
+
+ // If we have a global that is only initialized with a fixed size malloc,
+ // transform the program to use global memory instead of malloc'd memory.
+ // This eliminates dynamic allocation, avoids an indirection accessing the
+ // data, and exposes the resultant global to further GlobalOpt.
+ // We cannot optimize the malloc if we cannot determine malloc array size.
+ Value *NElems = getMallocArraySize(CI, DL, TLI, true);
+ if (!NElems)
+ return false;
+
+ if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
+ OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
+ return true;
+ }
+
+ // If the allocation is an array of structures, consider transforming this
+ // into multiple malloc'd arrays, one for each field. This is basically
+ // SRoA for malloc'd memory.
+
+ if (Ordering != AtomicOrdering::NotAtomic)
+ return false;
+
+ // If this is an allocation of a fixed size array of structs, analyze as a
+ // variable size array. malloc [100 x struct],1 -> malloc struct, 100
+ if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
+ if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+ AllocTy = AT->getElementType();
+
+ StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+ if (!AllocSTy)
+ return false;
+
+ // If the structure has an unreasonable number of fields, leave it
+ // alone.
+ if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+ AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
+
+ // If this is a fixed size array, transform the Malloc to be an alloc of
+ // structs. malloc [100 x struct],1 -> malloc struct, 100
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
+ Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
+ Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Instruction *Malloc =
+ CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
+ OpBundles, nullptr, CI->getName());
+ Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
+ CI->replaceAllUsesWith(Cast);
+ CI->eraseFromParent();
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
+ CI = cast<CallInst>(BCI->getOperand(0));
+ else
+ CI = cast<CallInst>(Malloc);
+ }
+
+ PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
+ TLI);
+ return true;
+ }
+
+ return false;
+}
+
+// Try to optimize globals based on the knowledge that only one value (besides
+// its initializer) is ever stored to the global.
+static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // Ignore no-op GEPs and bitcasts.
+ StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+ // If we are dealing with a pointer global that is initialized to null and
+ // only has one (non-null) value stored into it, then we can optimize any
+ // users of the loaded value (often calls and loads) that would trap if the
+ // value was null.
+ if (GV->getInitializer()->getType()->isPointerTy() &&
+ GV->getInitializer()->isNullValue()) {
+ if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+ if (GV->getInitializer()->getType() != SOVC->getType())
+ SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+ // Optimize away any trapping uses of the loaded value.
+ if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI))
+ return true;
+ } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) {
+ Type *MallocType = getMallocAllocatedType(CI, TLI);
+ if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
+ Ordering, DL, TLI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// At this point, we have learned that the only two values ever stored into GV
+/// are its initializer and OtherVal. See if we can shrink the global into a
+/// boolean and select between the two values whenever it is used. This exposes
+/// the values to other scalar optimizations.
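+/// Illustrative sketch (hypothetical names, not from the original source):
+///   @g = internal global i32 0              ; only 0 and 42 are ever stored
+/// becomes
+///   @g.b = internal global i1 false
+/// where stores write whether the incoming value is 42 and loads become
+///   select i1 %g.b.val, i32 42, i32 0
+/// (a zext is used instead of the select when the two values are 0 and 1).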
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+ Type *GVElType = GV->getValueType();
+
+ // If GVElType is already i1, it is already shrunk. If the type of the GV is
+ // an FP value, pointer or vector, don't do this optimization because a select
+ // between them is very expensive and unlikely to lead to later
+ // simplification. In these cases, we typically end up with "cond ? v1 : v2"
+ // where v1 and v2 both require constant pool loads, a big loss.
+ if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+ GVElType->isFloatingPointTy() ||
+ GVElType->isPointerTy() || GVElType->isVectorTy())
+ return false;
+
+ // Walk the use list of the global seeing if all the uses are load or store.
+ // If there is anything else, bail out.
+ for (User *U : GV->users())
+ if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
+ return false;
+
+ DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
+
+ // Create the new global, initializing it to false.
+ GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+ false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".b",
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
+
+ Constant *InitVal = GV->getInitializer();
+ assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
+ "No reason to shrink to bool!");
+
+ // If initialized to zero and storing one into the global, we can use a cast
+ // instead of a select to synthesize the desired value.
+ bool IsOneZero = false;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+ IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+ while (!GV->use_empty()) {
+ Instruction *UI = cast<Instruction>(GV->user_back());
+ if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Change the store into a boolean store.
+ bool StoringOther = SI->getOperand(0) == OtherVal;
+ // Only do this if we weren't storing a loaded value.
+ Value *StoreVal;
+ if (StoringOther || SI->getOperand(0) == InitVal) {
+ StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+ StoringOther);
+ } else {
+ // Otherwise, we are storing a previously loaded copy. To do this,
+ // change the copy from copying the original value to just copying the
+ // bool.
+ Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+ // If we've already replaced the input, StoredVal will be a cast or
+ // select instruction. If not, it will be a load of the original
+ // global.
+ if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ assert(LI->getOperand(0) == GV && "Not a copy!");
+ // Insert a new load, to preserve the saved value.
+ StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ } else {
+ assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+ "This is not a form that we understand!");
+ StoreVal = StoredVal->getOperand(0);
+ assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+ }
+ }
+ new StoreInst(StoreVal, NewGV, false, 0,
+ SI->getOrdering(), SI->getSyncScopeID(), SI);
+ } else {
+ // Change the load into a load of bool then a select.
+ LoadInst *LI = cast<LoadInst>(UI);
+ LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ Value *NSI;
+ if (IsOneZero)
+ NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+ else
+ NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+ NSI->takeName(LI);
+ LI->replaceAllUsesWith(NSI);
+ }
+ UI->eraseFromParent();
+ }
+
+ // Retain the name of the old global variable. People who are debugging their
+ // programs may expect these variables to be named the same.
+ NewGV->takeName(GV);
+ GV->eraseFromParent();
+ return true;
+}
+
+static bool deleteIfDead(GlobalValue &GV,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ GV.removeDeadConstantUsers();
+
+ if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
+ return false;
+
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
+ return false;
+
+ bool Dead;
+ if (auto *F = dyn_cast<Function>(&GV))
+ Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
+ else
+ Dead = GV.use_empty();
+ if (!Dead)
+ return false;
+
+ DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ GV.eraseFromParent();
+ ++NumDeleted;
+ return true;
+}
+
+static bool isPointerValueDeadOnEntryToFunction(
+ const Function *F, GlobalValue *GV,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ // Find all uses of GV. We expect them all to be in F, and if we can't
+ // identify any of the uses we bail out.
+ //
+ // On each of these uses, identify if the memory that GV points to is
+ // used/required/live at the start of the function. If it is not, for example
+ // if the first thing the function does is store to the GV, the GV can
+ // possibly be demoted.
+ //
+ // We don't do an exhaustive search for memory operations - simply look
+ // through bitcasts as they're quite common and benign.
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+ for (auto *U : GV->users()) {
+ if (Operator::getOpcode(U) == Instruction::BitCast) {
+ for (auto *UU : U->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(UU))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(UU))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+ continue;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ assert(I->getParent()->getParent() == F);
+
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+
+ // We have identified all uses of GV into loads and stores. Now check if all
+ // of them are known not to depend on the value of the global at the function
+ // entry point. We do this by ensuring that every load is dominated by at
+ // least one store.
+ auto &DT = LookupDomTree(*const_cast<Function *>(F));
+
+ // The below check is quadratic. Check we're not going to do too many tests.
+ // FIXME: Even though this will always have worst-case quadratic time, we
+ // could put effort into minimizing the average time by putting stores that
+ // have been shown to dominate at least one load at the beginning of the
+ // Stores array, making subsequent dominance checks more likely to succeed
+ // early.
+ //
+ // The threshold here is fairly large because global->local demotion is a
+ // very powerful optimization should it fire.
+ const unsigned Threshold = 100;
+ if (Loads.size() * Stores.size() > Threshold)
+ return false;
+
+ for (auto *L : Loads) {
+ auto *LTy = L->getType();
+ if (none_of(Stores, [&](const StoreInst *S) {
+ auto *STy = S->getValueOperand()->getType();
+ // The load is only dominated by the store if DomTree says so
+ // and the number of bits loaded in L is less than or equal to
+ // the number of bits stored in S.
+ return DT.dominates(S, L) &&
+ DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy);
+ }))
+ return false;
+ }
+ // All loads have known dependences inside F, so the global can be localized.
+ return true;
+}
+
+/// C may have non-instruction users. Can all of those users be turned into
+/// instructions?
+static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
+ // We don't do this exhaustively. The most common pattern that we really need
+ // to care about is a constant GEP or constant bitcast - so we just look
+ // through a single level of ConstantExpr.
+ //
+ // The set of constants that this function returns true for must be able to be
+ // handled by makeAllConstantUsesInstructions.
+ for (auto *U : C->users()) {
+ if (isa<Instruction>(U))
+ continue;
+ if (!isa<ConstantExpr>(U))
+ // Non-instruction, non-constantexpr user; cannot convert this.
+ return false;
+ for (auto *UU : U->users())
+ if (!isa<Instruction>(UU))
+ // A constantexpr used by another constant. We don't try and recurse any
+ // further but just bail out at this point.
+ return false;
+ }
+
+ return true;
+}
+
+/// C may have non-instruction users, and
+/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the
+/// non-instruction users to instructions.
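+/// Illustrative sketch (hypothetical IR, not from the original source): an
+/// instruction using the constant expression
+///   getelementptr (i32, i32* @g, i64 1)
+/// as an operand is rewritten to use an equivalent GetElementPtrInst inserted
+/// immediately before it, leaving @g with only instruction users.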
+static void makeAllConstantUsesInstructions(Constant *C) {
+ SmallVector<ConstantExpr*,4> Users;
+ for (auto *U : C->users()) {
+ if (isa<ConstantExpr>(U))
+ Users.push_back(cast<ConstantExpr>(U));
+ else
+ // We should never get here; allNonInstructionUsersCanBeMadeInstructions
+ // should not have returned true for C.
+ assert(
+ isa<Instruction>(U) &&
+ "Can't transform non-constantexpr non-instruction to instruction!");
+ }
+
+ SmallVector<Value*,4> UUsers;
+ for (auto *U : Users) {
+ UUsers.clear();
+ for (auto *UU : U->users())
+ UUsers.push_back(UU);
+ for (auto *UU : UUsers) {
+ Instruction *UI = cast<Instruction>(UU);
+ Instruction *NewU = U->getAsInstruction();
+ NewU->insertBefore(UI);
+ UI->replaceUsesOfWith(U, NewU);
+ }
+ // We've replaced all the uses, so destroy the constant. (destroyConstant
+ // will update value handles and metadata.)
+ U->destroyConstant();
+ }
+}
+
+/// Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+static bool processInternalGlobal(
+ GlobalVariable *GV, const GlobalStatus &GS, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ auto &DL = GV->getParent()->getDataLayout();
+ // If this is a first class global and has only one accessing function and
+ // this function is non-recursive, we replace the global with a local alloca
+ // in this function.
+ //
+ // NOTE: It doesn't make sense to promote non-single-value types since we
+ // are just replacing static memory with stack memory.
+ //
+ // If the global is in a different address space, don't bring it to the stack.
+ if (!GS.HasMultipleAccessingFunctions &&
+ GS.AccessingFunction &&
+ GV->getValueType()->isSingleValueType() &&
+ GV->getType()->getAddressSpace() == 0 &&
+ !GV->isExternallyInitialized() &&
+ allNonInstructionUsersCanBeMadeInstructions(GV) &&
+ GS.AccessingFunction->doesNotRecurse() &&
+ isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
+ LookupDomTree)) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
+ DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
+ Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+ ->getEntryBlock().begin());
+ Type *ElemTy = GV->getValueType();
+ // FIXME: Pass Global's alignment when globals have alignment
+ AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
+ GV->getName(), &FirstI);
+ if (!isa<UndefValue>(GV->getInitializer()))
+ new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+ makeAllConstantUsesInstructions(GV);
+
+ GV->replaceAllUsesWith(Alloca);
+ GV->eraseFromParent();
+ ++NumLocalized;
+ return true;
+ }
+
+ // If the global is never loaded (but may be stored to), it is dead.
+ // Delete it now.
+ if (!GS.IsLoaded) {
+ DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
+
+ bool Changed;
+ if (isLeakCheckerRoot(GV)) {
+ // Delete any constant stores to the global.
+ Changed = CleanupPointerRootUsers(GV, TLI);
+ } else {
+ // Delete any stores we can find to the global. We may not be able to
+ // make it completely dead though.
+ Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI);
+ }
+
+ // If the global is dead now, delete it.
+ if (GV->use_empty()) {
+ GV->eraseFromParent();
+ ++NumDeleted;
+ Changed = true;
+ }
+ return Changed;
+
+ }
+ if (GS.StoredType <= GlobalStatus::InitializerStored) {
+ DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+ GV->setConstant(true);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI);
+
+ // If the global is dead now, just nuke it.
+ if (GV->use_empty()) {
+ DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ return true;
+ }
+
+ // Fall through to the next check; see if we can optimize further.
+ ++NumMarked;
+ }
+ if (!GV->getInitializer()->getType()->isSingleValueType()) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (SRAGlobal(GV, DL))
+ return true;
+ }
+ if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
+ // If the initial value for the global was an undef value, and if only
+ // one other value was stored into it, we can just change the
+ // initializer to be the stored value, then delete all stores to the
+ // global. This allows us to mark it constant.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+ if (isa<UndefValue>(GV->getInitializer())) {
+ // Change the initial value here.
+ GV->setInitializer(SOVConstant);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI);
+
+ if (GV->use_empty()) {
+ DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ ++NumSubstitute;
+ return true;
+ }
+
+ // Try to optimize globals based on the knowledge that only one value
+ // (besides its initializer) is ever stored to the global.
+ if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI))
+ return true;
+
+ // Otherwise, if the global was not a boolean, we can shrink it to be a
+ // boolean.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+ if (GS.Ordering == AtomicOrdering::NotAtomic) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// Analyze the specified global variable and optimize it if possible. If we
+/// make a change, return true.
+static bool
+processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ if (GV.getName().startswith("llvm."))
+ return false;
+
+ GlobalStatus GS;
+
+ if (GlobalStatus::analyzeGlobal(&GV, GS))
+ return false;
+
+ bool Changed = false;
+ if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
+ auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
+ : GlobalValue::UnnamedAddr::Local;
+ if (NewUnnamedAddr != GV.getUnnamedAddr()) {
+ GV.setUnnamedAddr(NewUnnamedAddr);
+ NumUnnamed++;
+ Changed = true;
+ }
+ }
+
+ // Do more involved optimizations if the global is internal.
+ if (!GV.hasLocalLinkage())
+ return Changed;
+
+ auto *GVar = dyn_cast<GlobalVariable>(&GV);
+ if (!GVar)
+ return Changed;
+
+ if (GVar->isConstant() || !GVar->hasInitializer())
+ return Changed;
+
+ return processInternalGlobal(GVar, GS, TLI, LookupDomTree) || Changed;
+}
+
+/// Walk all of the direct calls of the specified function, changing them to
+/// FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallSite CS(cast<Instruction>(U));
+ CS.setCallingConv(CallingConv::Fast);
+ }
+}
+
+static AttributeList StripNest(LLVMContext &C, AttributeList Attrs) {
+ // There can be at most one attribute set with a nest attribute.
+ unsigned NestIndex;
+ if (Attrs.hasAttrSomewhere(Attribute::Nest, &NestIndex))
+ return Attrs.removeAttribute(C, NestIndex, Attribute::Nest);
+ return Attrs;
+}
+
+static void RemoveNestAttribute(Function *F) {
+ F->setAttributes(StripNest(F->getContext(), F->getAttributes()));
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallSite CS(cast<Instruction>(U));
+ CS.setAttributes(StripNest(F->getContext(), CS.getAttributes()));
+ }
+}
+
+/// Return true if this is a calling convention that we'd like to change. The
+/// idea here is that we don't want to mess with the convention if the user
+/// explicitly requested something with performance implications like coldcc,
+/// GHC, or anyregcc.
+static bool isProfitableToMakeFastCC(Function *F) {
+ CallingConv::ID CC = F->getCallingConv();
+ // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
+ return CC == CallingConv::C || CC == CallingConv::X86_ThisCall;
+}
+
+static bool
+OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ bool Changed = false;
+ // Optimize functions.
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+ Function *F = &*FI++;
+ // Functions without names cannot be referenced outside this module.
+ if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
+ F->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*F, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ Changed |= processGlobal(*F, TLI, LookupDomTree);
+
+ if (!F->hasLocalLinkage())
+ continue;
+ if (isProfitableToMakeFastCC(F) && !F->isVarArg() &&
+ !F->hasAddressTaken()) {
+ // If this function has a calling convention worth changing, is not a
+ // varargs function, and is only called directly, promote it to use the
+ // Fast calling convention.
+ F->setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(F);
+ ++NumFastCallFns;
+ Changed = true;
+ }
+
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ !F->hasAddressTaken()) {
+ // The function is not used by a trampoline intrinsic, so it is safe
+ // to remove the 'nest' attribute.
+ RemoveNestAttribute(F);
+ ++NumNestRemoved;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+static bool
+OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ bool Changed = false;
+
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+ // Global variables without names cannot be referenced outside this module.
+ if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
+ GV->setLinkage(GlobalValue::InternalLinkage);
+ // Simplify the initializer.
+ if (GV->hasInitializer())
+ if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
+ auto &DL = M.getDataLayout();
+ Constant *New = ConstantFoldConstant(C, DL, TLI);
+ if (New && New != C)
+ GV->setInitializer(New);
+ }
+
+ if (deleteIfDead(*GV, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ Changed |= processGlobal(*GV, TLI, LookupDomTree);
+ }
+ return Changed;
+}
+
+/// Evaluate a piece of a constantexpr store into a global initializer. This
+/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
+/// GEP operands of Addr [0, OpNo) have been stepped into.
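+/// Illustrative sketch (hypothetical initializer, not from the original
+/// source): committing i32 7 through
+///   getelementptr ({ i32, [2 x i32] }, { i32, [2 x i32] }* @g, i32 0, i32 1, i32 0)
+/// rebuilds @g's initializer with aggregate element [1][0] replaced by 7 and
+/// everything else left untouched.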
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+ ConstantExpr *Addr, unsigned OpNo) {
+ // Base case of the recursion.
+ if (OpNo == Addr->getNumOperands()) {
+ assert(Val->getType() == Init->getType() && "Type mismatch!");
+ return Val;
+ }
+
+ SmallVector<Constant*, 32> Elts;
+ if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ // Break up the constant into its elements.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ // Replace the element that we are supposed to.
+ ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+ unsigned Idx = CU->getZExtValue();
+ assert(Idx < STy->getNumElements() && "Struct index out of range!");
+ Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+ // Return the modified struct.
+ return ConstantStruct::get(STy, Elts);
+ }
+
+ ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+ SequentialType *InitTy = cast<SequentialType>(Init->getType());
+ uint64_t NumElts = InitTy->getNumElements();
+
+ // Break up the array into elements.
+ for (uint64_t i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ assert(CI->getZExtValue() < NumElts);
+ Elts[CI->getZExtValue()] =
+ EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+
+ if (Init->getType()->isArrayTy())
+ return ConstantArray::get(cast<ArrayType>(InitTy), Elts);
+ return ConstantVector::get(Elts);
+}
+
+/// We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ assert(GV->hasInitializer());
+ GV->setInitializer(Val);
+ return;
+ }
+
+ ConstantExpr *CE = cast<ConstantExpr>(Addr);
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
+}
+
+/// Evaluate static constructors in the function, if we can. Return true if we
+/// can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // Call the function.
+ Evaluator Eval(DL, TLI);
+ Constant *RetValDummy;
+ bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
+ SmallVector<Constant*, 0>());
+
+ if (EvalSuccess) {
+ ++NumCtorsEvaluated;
+
+ // We succeeded at evaluation: commit the result.
+ DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to " << Eval.getMutatedMemory().size()
+ << " stores.\n");
+ for (const auto &I : Eval.getMutatedMemory())
+ CommitValueTo(I.second, I.first);
+ for (GlobalVariable *GV : Eval.getInvariants())
+ GV->setConstant(true);
+ }
+
+ return EvalSuccess;
+}
+
+static int compareNames(Constant *const *A, Constant *const *B) {
+ Value *AStripped = (*A)->stripPointerCastsNoFollowAliases();
+ Value *BStripped = (*B)->stripPointerCastsNoFollowAliases();
+ return AStripped->getName().compare(BStripped->getName());
+}
+
+static void setUsedInitializer(GlobalVariable &V,
+ const SmallPtrSet<GlobalValue *, 8> &Init) {
+ if (Init.empty()) {
+ V.eraseFromParent();
+ return;
+ }
+
+ // Type of pointer to the array of pointers.
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
+
+ SmallVector<llvm::Constant *, 8> UsedArray;
+ for (GlobalValue *GV : Init) {
+ Constant *Cast
+ = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
+ UsedArray.push_back(Cast);
+ }
+ // Sort to get deterministic order.
+ array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
+
+ Module *M = V.getParent();
+ V.removeFromParent();
+ GlobalVariable *NV =
+ new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage,
+ llvm::ConstantArray::get(ATy, UsedArray), "");
+ NV->takeName(&V);
+ NV->setSection("llvm.metadata");
+ delete &V;
+}
+
+namespace {
+/// An easy to access representation of llvm.used and llvm.compiler.used.
+class LLVMUsed {
+ SmallPtrSet<GlobalValue *, 8> Used;
+ SmallPtrSet<GlobalValue *, 8> CompilerUsed;
+ GlobalVariable *UsedV;
+ GlobalVariable *CompilerUsedV;
+
+public:
+ LLVMUsed(Module &M) {
+ UsedV = collectUsedGlobalVariables(M, Used, false);
+ CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
+ }
+ typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator;
+ typedef iterator_range<iterator> used_iterator_range;
+ iterator usedBegin() { return Used.begin(); }
+ iterator usedEnd() { return Used.end(); }
+ used_iterator_range used() {
+ return used_iterator_range(usedBegin(), usedEnd());
+ }
+ iterator compilerUsedBegin() { return CompilerUsed.begin(); }
+ iterator compilerUsedEnd() { return CompilerUsed.end(); }
+ used_iterator_range compilerUsed() {
+ return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
+ }
+ bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
+ bool compilerUsedCount(GlobalValue *GV) const {
+ return CompilerUsed.count(GV);
+ }
+ bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
+ bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
+ bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
+ bool compilerUsedInsert(GlobalValue *GV) {
+ return CompilerUsed.insert(GV).second;
+ }
+
+ void syncVariablesAndSets() {
+ if (UsedV)
+ setUsedInitializer(*UsedV, Used);
+ if (CompilerUsedV)
+ setUsedInitializer(*CompilerUsedV, CompilerUsed);
+ }
+};
+}
+
+static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
+ if (GA.use_empty()) // No use at all.
+ return false;
+
+ assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (!GA.hasOneUse())
+    // Strictly more than one use, so at least one use is in neither llvm.used
+    // nor llvm.compiler.used.
+ return true;
+
+ // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
+}
+
+static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
+ const LLVMUsed &U) {
+ unsigned N = 2;
+ assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (U.usedCount(&V) || U.compilerUsedCount(&V))
+ ++N;
+ return V.hasNUsesOrMore(N);
+}
+
+static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
+ if (!GA.hasLocalLinkage())
+ return true;
+
+ return U.usedCount(&GA) || U.compilerUsedCount(&GA);
+}
+
+static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
+ bool &RenameTarget) {
+ RenameTarget = false;
+ bool Ret = false;
+ if (hasUseOtherThanLLVMUsed(GA, U))
+ Ret = true;
+
+ // If the alias is externally visible, we may still be able to simplify it.
+ if (!mayHaveOtherReferences(GA, U))
+ return Ret;
+
+ // If the aliasee has internal linkage, give it the name and linkage
+ // of the alias, and delete the alias. This turns:
+ // define internal ... @f(...)
+ // @a = alias ... @f
+ // into:
+ // define ... @a(...)
+ Constant *Aliasee = GA.getAliasee();
+ GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+ if (!Target->hasLocalLinkage())
+ return Ret;
+
+ // Do not perform the transform if multiple aliases potentially target the
+ // aliasee. This check also ensures that it is safe to replace the section
+ // and other attributes of the aliasee with those of the alias.
+ if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
+ return Ret;
+
+ RenameTarget = true;
+ return true;
+}
+
+static bool
+OptimizeGlobalAliases(Module &M,
+ SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
+ bool Changed = false;
+ LLVMUsed Used(M);
+
+ for (GlobalValue *GV : Used.used())
+ Used.compilerUsedErase(GV);
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ GlobalAlias *J = &*I++;
+
+ // Aliases without names cannot be referenced outside this module.
+ if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
+ J->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*J, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ // If the aliasee may change at link time, nothing can be done - bail out.
+ if (J->isInterposable())
+ continue;
+
+ Constant *Aliasee = J->getAliasee();
+ GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
+ // We can't trivially replace the alias with the aliasee if the aliasee is
+ // non-trivial in some way.
+ // TODO: Try to handle non-zero GEPs of local aliasees.
+ if (!Target)
+ continue;
+ Target->removeDeadConstantUsers();
+
+ // Make all users of the alias use the aliasee instead.
+ bool RenameTarget;
+ if (!hasUsesToReplace(*J, Used, RenameTarget))
+ continue;
+
+ J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
+ ++NumAliasesResolved;
+ Changed = true;
+
+ if (RenameTarget) {
+ // Give the aliasee the name, linkage and other attributes of the alias.
+ Target->takeName(&*J);
+ Target->setLinkage(J->getLinkage());
+ Target->setVisibility(J->getVisibility());
+ Target->setDLLStorageClass(J->getDLLStorageClass());
+
+ if (Used.usedErase(&*J))
+ Used.usedInsert(Target);
+
+ if (Used.compilerUsedErase(&*J))
+ Used.compilerUsedInsert(Target);
+ } else if (mayHaveOtherReferences(*J, Used))
+ continue;
+
+ // Delete the alias.
+ M.getAliasList().erase(J);
+ ++NumAliasesRemoved;
+ Changed = true;
+ }
+
+ Used.syncVariablesAndSets();
+
+ return Changed;
+}
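
At the source level, the alias shape this loop resolves can be produced with the GNU alias attribute. A hedged C-style example with made-up names (assumes an ELF target where the attribute is supported): when 'impl' is internal and only reachable through the alias, the code above gives 'impl' the name, linkage, visibility and DLL storage class of the alias and then deletes the alias, as described in hasUsesToReplace().

/* 'impl' has internal linkage; 'entry' becomes a GlobalAlias for it. If every
 * reference goes through 'entry', globalopt can rename 'impl' to 'entry' and
 * drop the alias entirely. */
static int impl(int x) { return x + 1; }
int entry(int x) __attribute__((alias("impl")));
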
+
+static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
+ LibFunc F = LibFunc_cxa_atexit;
+ if (!TLI->has(F))
+ return nullptr;
+
+ Function *Fn = M.getFunction(TLI->getName(F));
+ if (!Fn)
+ return nullptr;
+
+ // Make sure that the function has the correct prototype.
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
+ return nullptr;
+
+ return Fn;
+}
+
+/// Returns whether the given function is an empty C++ destructor and can
+/// therefore be eliminated.
+/// Note that we assume that other optimization passes have already simplified
+/// the code, so we only look for a function with a single basic block, where
+/// the only allowed instructions are 'ret', 'call' to an empty C++ dtor, and
+/// other side-effect-free instructions.
+static bool cxxDtorIsEmpty(const Function &Fn,
+ SmallPtrSet<const Function *, 8> &CalledFunctions) {
+ // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
+ // nounwind, but that doesn't seem worth doing.
+ if (Fn.isDeclaration())
+ return false;
+
+ if (++Fn.begin() != Fn.end())
+ return false;
+
+ const BasicBlock &EntryBlock = Fn.getEntryBlock();
+ for (BasicBlock::const_iterator I = EntryBlock.begin(), E = EntryBlock.end();
+ I != E; ++I) {
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ // Ignore debug intrinsics.
+ if (isa<DbgInfoIntrinsic>(CI))
+ continue;
+
+ const Function *CalledFn = CI->getCalledFunction();
+
+ if (!CalledFn)
+ return false;
+
+ SmallPtrSet<const Function *, 8> NewCalledFunctions(CalledFunctions);
+
+ // Don't treat recursive functions as empty.
+ if (!NewCalledFunctions.insert(CalledFn).second)
+ return false;
+
+ if (!cxxDtorIsEmpty(*CalledFn, NewCalledFunctions))
+ return false;
+ } else if (isa<ReturnInst>(*I))
+ return true; // We're done.
+ else if (I->mayHaveSideEffects())
+ return false; // Destructor with side effects, bail.
+ }
+
+ return false;
+}
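
For context, a hedged C++ example (the type name is hypothetical) of the pattern this predicate is meant to accept: a user-provided but empty destructor on a global object. On an Itanium-ABI target the front end registers the destructor through __cxa_atexit, and once earlier passes reduce the destructor body to a bare return, that registration becomes removable by the routine that follows.

struct Logger {
  int Level = 0;
  ~Logger() {} // User-provided but does nothing; lowers to a single 'ret'.
};

// The front end registers TheLogger's destructor via __cxa_atexit; that
// registration is a candidate for removal by OptimizeEmptyGlobalCXXDtors.
static Logger TheLogger;
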
+
+static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+ /// Itanium C++ ABI p3.3.5:
+ ///
+ /// After constructing a global (or local static) object, that will require
+ /// destruction on exit, a termination function is registered as follows:
+ ///
+ /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
+ ///
+ /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
+ /// call f(p) when DSO d is unloaded, before all such termination calls
+ /// registered before this one. It returns zero if registration is
+ /// successful, nonzero on failure.
+
+ // This pass will look for calls to __cxa_atexit where the function is trivial
+ // and remove them.
+ bool Changed = false;
+
+ for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
+ I != E;) {
+ // We're only interested in calls. Theoretically, we could handle invoke
+ // instructions as well, but neither llvm-gcc nor clang generate invokes
+ // to __cxa_atexit.
+ CallInst *CI = dyn_cast<CallInst>(*I++);
+ if (!CI)
+ continue;
+
+ Function *DtorFn =
+ dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
+ if (!DtorFn)
+ continue;
+
+ SmallPtrSet<const Function *, 8> CalledFunctions;
+ if (!cxxDtorIsEmpty(*DtorFn, CalledFunctions))
+ continue;
+
+ // Just remove the call.
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ CI->eraseFromParent();
+
+ ++NumCXXDtorsRemoved;
+
+    Changed = true;
+ }
+
+ return Changed;
+}
+
+static bool optimizeGlobalsInModule(
+ Module &M, const DataLayout &DL, TargetLibraryInfo *TLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ SmallSet<const Comdat *, 8> NotDiscardableComdats;
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ NotDiscardableComdats.clear();
+ for (const GlobalVariable &GV : M.globals())
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.isDiscardableIfUnused() || !GV.use_empty())
+ NotDiscardableComdats.insert(C);
+ for (Function &F : M)
+ if (const Comdat *C = F.getComdat())
+ if (!F.isDefTriviallyDead())
+ NotDiscardableComdats.insert(C);
+ for (GlobalAlias &GA : M.aliases())
+ if (const Comdat *C = GA.getComdat())
+ if (!GA.isDiscardableIfUnused() || !GA.use_empty())
+ NotDiscardableComdats.insert(C);
+
+ // Delete functions that are trivially dead, ccc -> fastcc
+ LocalChange |=
+ OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats);
+
+ // Optimize global_ctors list.
+ LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
+ return EvaluateStaticConstructor(F, DL, TLI);
+ });
+
+ // Optimize non-address-taken globals.
+ LocalChange |= OptimizeGlobalVars(M, TLI, LookupDomTree,
+ NotDiscardableComdats);
+
+ // Resolve aliases, when possible.
+ LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
+
+ // Try to remove trivial global destructors if they are not removed
+ // already.
+ Function *CXAAtExitFn = FindCXAAtExit(M, TLI);
+ if (CXAAtExitFn)
+ LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+
+ Changed |= LocalChange;
+ }
+
+ // TODO: Move all global ctors functions to the end of the module for code
+ // layout.
+
+ return Changed;
+}
+
+PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
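
A minimal usage sketch for the new-pass-manager entry point above, assuming the standard PassBuilder-registered analyses (illustrative only; the helper name and setup are hypothetical, not taken from this diff):

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"

static void runGlobalOpt(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  // Register the analyses and cross-register the proxies so GlobalOptPass can
  // reach TargetLibraryAnalysis and the per-function DominatorTreeAnalysis.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::GlobalOptPass());
  MPM.run(M, MAM);
}
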
+
+namespace {
+struct GlobalOptLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ GlobalOptLegacyPass() : ModulePass(ID) {
+ initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto &DL = M.getDataLayout();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+}
+
+char GlobalOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() {
+ return new GlobalOptLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
new file mode 100644
index 000000000000..e47d881d1127
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -0,0 +1,180 @@
+//===- GlobalSplit.cpp - global variable splitter -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass uses inrange annotations on GEP indices to split globals where
+// beneficial. Clang currently attaches these annotations to references to
+// virtual table globals under the Itanium ABI for the benefit of the
+// whole-program virtual call optimization and control flow integrity passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+
+#include <set>
+
+using namespace llvm;
+
+namespace {
+
+bool splitGlobal(GlobalVariable &GV) {
+ // If the address of the global is taken outside of the module, we cannot
+ // apply this transformation.
+ if (!GV.hasLocalLinkage())
+ return false;
+
+ // We currently only know how to split ConstantStructs.
+ auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
+ if (!Init)
+ return false;
+
+ // Verify that each user of the global is an inrange getelementptr constant.
+ // From this it follows that any loads from or stores to that global must use
+ // a pointer derived from an inrange getelementptr constant, which is
+ // sufficient to allow us to apply the splitting transform.
+ for (User *U : GV.users()) {
+ if (!isa<Constant>(U))
+ return false;
+
+ auto *GEP = dyn_cast<GEPOperator>(U);
+ if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ !isa<ConstantInt>(GEP->getOperand(2)))
+ return false;
+ }
+
+ SmallVector<MDNode *, 2> Types;
+ GV.getMetadata(LLVMContext::MD_type, Types);
+
+ const DataLayout &DL = GV.getParent()->getDataLayout();
+ const StructLayout *SL = DL.getStructLayout(Init->getType());
+
+ IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
+
+ std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
+ for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
+ // Build a global representing this split piece.
+ auto *SplitGV =
+ new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
+ GV.isConstant(), GlobalValue::PrivateLinkage,
+ Init->getOperand(I), GV.getName() + "." + utostr(I));
+ SplitGlobals[I] = SplitGV;
+
+ unsigned SplitBegin = SL->getElementOffset(I);
+ unsigned SplitEnd = (I == Init->getNumOperands() - 1)
+ ? SL->getSizeInBytes()
+ : SL->getElementOffset(I + 1);
+
+ // Rebuild type metadata, adjusting by the split offset.
+ // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
+ for (MDNode *Type : Types) {
+ uint64_t ByteOffset = cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ // Type metadata may be attached one byte after the end of the vtable, for
+      // classes without virtual methods in the Itanium ABI. AFAIK, it is never
+ // attached to the first byte of a vtable. Subtract one to get the right
+ // slice.
+ // This is making an assumption that vtable groups are the only kinds of
+ // global variables that !type metadata can be attached to, and that they
+ // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+ // Microsoft ABI vtables).
+ uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+ if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
+ continue;
+ SplitGV->addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(GV.getContext(),
+ {ConstantAsMetadata::get(
+ ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
+ Type->getOperand(1)}));
+ }
+ }
+
+ for (User *U : GV.users()) {
+ auto *GEP = cast<GEPOperator>(U);
+ unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (I >= SplitGlobals.size())
+ continue;
+
+ SmallVector<Value *, 4> Ops;
+ Ops.push_back(ConstantInt::get(Int32Ty, 0));
+ for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
+ Ops.push_back(GEP->getOperand(I));
+
+ auto *NewGEP = ConstantExpr::getGetElementPtr(
+ SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
+ GEP->isInBounds());
+ GEP->replaceAllUsesWith(NewGEP);
+ }
+
+ // Finally, remove the original global. Any remaining uses refer to invalid
+ // elements of the global, so replace with undef.
+ if (!GV.use_empty())
+ GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
+ GV.eraseFromParent();
+ return true;
+}
+
+bool splitGlobals(Module &M) {
+ // First, see if the module uses either of the llvm.type.test or
+ // llvm.type.checked.load intrinsics, which indicates that splitting globals
+ // may be beneficial.
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ bool Changed = false;
+ for (auto I = M.global_begin(); I != M.global_end();) {
+ GlobalVariable &GV = *I;
+ ++I;
+ Changed |= splitGlobal(GV);
+ }
+ return Changed;
+}
+
+struct GlobalSplit : public ModulePass {
+ static char ID;
+ GlobalSplit() : ModulePass(ID) {
+ initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return splitGlobals(M);
+ }
+};
+
+}
+
+INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
+char GlobalSplit::ID = 0;
+
+ModulePass *llvm::createGlobalSplitPass() {
+ return new GlobalSplit;
+}
+
+PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!splitGlobals(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
new file mode 100644
index 000000000000..f79b61037f1d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -0,0 +1,286 @@
+//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an _extremely_ simple interprocedural constant
+// propagation pass. It could certainly be improved in many different ways,
+// like using a worklist. This pass makes arguments dead, but does not remove
+// them. The existing dead argument elimination pass should be run after this
+// to clean up the mess.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ipconstprop"
+
+STATISTIC(NumArgumentsProped, "Number of args turned into constants");
+STATISTIC(NumReturnValProped, "Number of return values turned into constants");
+
+namespace {
+ /// IPCP - The interprocedural constant propagation pass
+ ///
+ struct IPCP : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ IPCP() : ModulePass(ID) {
+ initializeIPCPPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+ };
+}
+
+/// PropagateConstantsIntoArguments - Look at all uses of the specified
+/// function. If all uses are direct call sites, and all pass a particular
+/// constant in for an argument, propagate that constant in as the argument.
+///
+static bool PropagateConstantsIntoArguments(Function &F) {
+  // No arguments or no uses? Early exit.
+  if (F.arg_empty() || F.use_empty()) return false;
+
+ // For each argument, keep track of its constant value and whether it is a
+ // constant or not. The bool is driven to true when found to be non-constant.
+ SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
+ ArgumentConstants.resize(F.arg_size());
+
+ unsigned NumNonconstant = 0;
+ for (Use &U : F.uses()) {
+ User *UR = U.getUser();
+ // Ignore blockaddress uses.
+ if (isa<BlockAddress>(UR)) continue;
+
+    // If the user is not a call or invoke instruction, or F is not the callee
+    // operand, do not transform.
+ if (!isa<CallInst>(UR) && !isa<InvokeInst>(UR))
+ return false;
+
+ CallSite CS(cast<Instruction>(UR));
+ if (!CS.isCallee(&U))
+ return false;
+
+ // Check out all of the potentially constant arguments. Note that we don't
+ // inspect varargs here.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ Function::arg_iterator Arg = F.arg_begin();
+ for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
+ ++i, ++AI, ++Arg) {
+
+ // If this argument is known non-constant, ignore it.
+ if (ArgumentConstants[i].second)
+ continue;
+
+ Constant *C = dyn_cast<Constant>(*AI);
+ if (C && ArgumentConstants[i].first == nullptr) {
+ ArgumentConstants[i].first = C; // First constant seen.
+ } else if (C && ArgumentConstants[i].first == C) {
+ // Still the constant value we think it is.
+ } else if (*AI == &*Arg) {
+ // Ignore recursive calls passing argument down.
+ } else {
+ // Argument became non-constant. If all arguments are non-constant now,
+ // give up on this function.
+ if (++NumNonconstant == ArgumentConstants.size())
+ return false;
+ ArgumentConstants[i].second = true;
+ }
+ }
+ }
+
+ // If we got to this point, there is a constant argument!
+ assert(NumNonconstant != ArgumentConstants.size());
+ bool MadeChange = false;
+ Function::arg_iterator AI = F.arg_begin();
+ for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
+ // Do we have a constant argument?
+ if (ArgumentConstants[i].second || AI->use_empty() ||
+ AI->hasInAllocaAttr() || (AI->hasByValAttr() && !F.onlyReadsMemory()))
+ continue;
+
+ Value *V = ArgumentConstants[i].first;
+ if (!V) V = UndefValue::get(AI->getType());
+ AI->replaceAllUsesWith(V);
+ ++NumArgumentsProped;
+ MadeChange = true;
+ }
+ return MadeChange;
+}
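
A hedged source-level illustration of the situation this routine handles, with made-up names: 'scaled' has local linkage and every direct call passes the same constant for 'factor', so uses of 'factor' inside 'scaled' can be rewritten to that constant (deadargelim can then drop the now-dead parameter).

static int scaled(int x, int factor) { return x * factor; }

int twice_scaled(int a) {
  /* Both call sites pass factor == 8, so ipconstprop may replace every use
   * of 'factor' inside 'scaled' with the constant 8. */
  return scaled(a, 8) + scaled(a + 1, 8);
}
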
+
+
+// Check to see if this function returns one or more constants. If so, replace
+// all callers that use those return values with the constant value. This will
+// leave in the actual return values and instructions, but deadargelim will
+// clean that up.
+//
+// Additionally if a function always returns one of its arguments directly,
+// callers will be updated to use the value they pass in directly instead of
+// using the return value.
+static bool PropagateConstantReturn(Function &F) {
+ if (F.getReturnType()->isVoidTy())
+ return false; // No return value.
+
+ // We can infer and propagate the return value only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F.isDefinitionExact())
+ return false;
+
+  // Don't touch naked functions. They may contain asm returning a
+  // value we don't see, so we may end up interprocedurally propagating
+  // the return value incorrectly.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+
+ // Check to see if this function returns a constant.
+ SmallVector<Value *,4> RetVals;
+ StructType *STy = dyn_cast<StructType>(F.getReturnType());
+ if (STy)
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i)
+ RetVals.push_back(UndefValue::get(STy->getElementType(i)));
+ else
+ RetVals.push_back(UndefValue::get(F.getReturnType()));
+
+ unsigned NumNonConstant = 0;
+ for (BasicBlock &BB : F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
+ // Already found conflicting return values?
+ Value *RV = RetVals[i];
+ if (!RV)
+ continue;
+
+ // Find the returned value
+ Value *V;
+ if (!STy)
+ V = RI->getOperand(0);
+ else
+ V = FindInsertedValue(RI->getOperand(0), i);
+
+ if (V) {
+ // Ignore undefs, we can change them into anything
+ if (isa<UndefValue>(V))
+ continue;
+
+ // Try to see if all the rets return the same constant or argument.
+ if (isa<Constant>(V) || isa<Argument>(V)) {
+ if (isa<UndefValue>(RV)) {
+ // No value found yet? Try the current one.
+ RetVals[i] = V;
+ continue;
+ }
+ // Returning the same value? Good.
+ if (RV == V)
+ continue;
+ }
+ }
+ // Different or no known return value? Don't propagate this return
+ // value.
+ RetVals[i] = nullptr;
+ // All values non-constant? Stop looking.
+ if (++NumNonConstant == RetVals.size())
+ return false;
+ }
+ }
+
+ // If we got here, the function returns at least one constant value. Loop
+ // over all users, replacing any uses of the return value with the returned
+ // constant.
+ bool MadeChange = false;
+ for (Use &U : F.uses()) {
+ CallSite CS(U.getUser());
+ Instruction* Call = CS.getInstruction();
+
+ // Not a call instruction or a call instruction that's not calling F
+ // directly?
+ if (!Call || !CS.isCallee(&U))
+ continue;
+
+ // Call result not used?
+ if (Call->use_empty())
+ continue;
+
+ MadeChange = true;
+
+ if (!STy) {
+ Value* New = RetVals[0];
+ if (Argument *A = dyn_cast<Argument>(New))
+ // Was an argument returned? Then find the corresponding argument in
+ // the call instruction and use that.
+ New = CS.getArgument(A->getArgNo());
+ Call->replaceAllUsesWith(New);
+ continue;
+ }
+
+ for (auto I = Call->user_begin(), E = Call->user_end(); I != E;) {
+ Instruction *Ins = cast<Instruction>(*I);
+
+ // Increment now, so we can remove the use
+ ++I;
+
+ // Find the index of the retval to replace with
+ int index = -1;
+ if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins))
+ if (EV->hasIndices())
+ index = *EV->idx_begin();
+
+ // If this use uses a specific return value, and we have a replacement,
+ // replace it.
+ if (index != -1) {
+ Value *New = RetVals[index];
+ if (New) {
+ if (Argument *A = dyn_cast<Argument>(New))
+ // Was an argument returned? Then find the corresponding argument in
+ // the call instruction and use that.
+ New = CS.getArgument(A->getArgNo());
+ Ins->replaceAllUsesWith(New);
+ Ins->eraseFromParent();
+ }
+ }
+ }
+ }
+
+ if (MadeChange) ++NumReturnValProped;
+ return MadeChange;
+}
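
The complementary return-value case, again as a hedged sketch with made-up names: the exactly-defined function below can only ever return 3, so callers' uses of the call result may be folded to that constant, leaving the return instructions themselves for deadargelim to clean up.

static int protocol_version(void) { return 3; }

int negotiate_header(void) {
  /* The use of the call result may be replaced with the constant 3; the
   * call instruction itself stays behind for later cleanup passes. */
  return protocol_version() + 0x100;
}
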
+
+char IPCP::ID = 0;
+INITIALIZE_PASS(IPCP, "ipconstprop",
+ "Interprocedural constant propagation", false, false)
+
+ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
+
+bool IPCP::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+ bool LocalChange = true;
+
+ // FIXME: instead of using smart algorithms, we just iterate until we stop
+ // making changes.
+ while (LocalChange) {
+ LocalChange = false;
+ for (Function &F : M)
+ if (!F.isDeclaration()) {
+        // Delete any dead constant users (klingons).
+ F.removeDeadConstantUsers();
+ if (F.hasLocalLinkage())
+ LocalChange |= PropagateConstantsIntoArguments(F);
+ Changed |= PropagateConstantReturn(F);
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
new file mode 100644
index 000000000000..5bb305ca84d0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -0,0 +1,123 @@
+//===-- IPO.cpp -----------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for
+// libLLVMIPO.a, which implements several transformations over the LLVM
+// intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+
+using namespace llvm;
+
+void llvm::initializeIPO(PassRegistry &Registry) {
+ initializeArgPromotionPass(Registry);
+ initializeConstantMergeLegacyPassPass(Registry);
+ initializeCrossDSOCFIPass(Registry);
+ initializeDAEPass(Registry);
+ initializeDAHPass(Registry);
+ initializeForceFunctionAttrsLegacyPassPass(Registry);
+ initializeGlobalDCELegacyPassPass(Registry);
+ initializeGlobalOptLegacyPassPass(Registry);
+ initializeGlobalSplitPass(Registry);
+ initializeIPCPPass(Registry);
+ initializeAlwaysInlinerLegacyPassPass(Registry);
+ initializeSimpleInlinerPass(Registry);
+ initializeInferFunctionAttrsLegacyPassPass(Registry);
+ initializeInternalizeLegacyPassPass(Registry);
+ initializeLoopExtractorPass(Registry);
+ initializeBlockExtractorPassPass(Registry);
+ initializeSingleLoopExtractorPass(Registry);
+ initializeLowerTypeTestsPass(Registry);
+ initializeMergeFunctionsPass(Registry);
+ initializePartialInlinerLegacyPassPass(Registry);
+ initializePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializePruneEHPass(Registry);
+ initializeStripDeadPrototypesLegacyPassPass(Registry);
+ initializeStripSymbolsPass(Registry);
+ initializeStripDebugDeclarePass(Registry);
+ initializeStripDeadDebugInfoPass(Registry);
+ initializeStripNonDebugSymbolsPass(Registry);
+ initializeBarrierNoopPass(Registry);
+ initializeEliminateAvailableExternallyLegacyPassPass(Registry);
+ initializeSampleProfileLoaderLegacyPassPass(Registry);
+ initializeFunctionImportLegacyPassPass(Registry);
+ initializeWholeProgramDevirtPass(Registry);
+}
+
+void LLVMInitializeIPO(LLVMPassRegistryRef R) {
+ initializeIPO(*unwrap(R));
+}
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIPConstantPropagationPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIPSCCPPass());
+}
+
+void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
+ auto PreserveMain = [=](const GlobalValue &GV) {
+ return AllButMain && GV.getName() == "main";
+ };
+ unwrap(PM)->add(createInternalizePass(PreserveMain));
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripSymbolsPass());
+}
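
A minimal, hedged sketch of driving a few of the C bindings above (the helper name is made up; it assumes an LLVMModuleRef built elsewhere through the C API):

#include <llvm-c/Core.h>
#include <llvm-c/Transforms/IPO.h>

/* Run a small IPO pipeline over an existing module. */
static void runIPOPipeline(LLVMModuleRef Mod) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddGlobalOptimizerPass(PM);       /* globalopt   */
  LLVMAddIPConstantPropagationPass(PM); /* ipconstprop */
  LLVMAddDeadArgEliminationPass(PM);    /* deadargelim */
  LLVMRunPassManager(PM, Mod);
  LLVMDisposePassManager(PM);
}
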
diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
new file mode 100644
index 000000000000..15d7515cc842
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -0,0 +1,80 @@
+//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "inferattrs"
+
+static bool inferAllPrototypeAttributes(Module &M,
+ const TargetLibraryInfo &TLI) {
+ bool Changed = false;
+
+ for (Function &F : M.functions())
+ // We only infer things using the prototype and the name; we don't need
+ // definitions.
+    if (F.isDeclaration() && !F.hasFnAttribute(Attribute::OptimizeNone))
+ Changed |= inferLibFuncAttributes(F, TLI);
+
+ return Changed;
+}
+
+PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+
+ if (!inferAllPrototypeAttributes(M, TLI))
+ // If we didn't infer anything, preserve all analyses.
+ return PreservedAnalyses::all();
+
+ // Otherwise, we may have changed fundamental function attributes, so clear
+ // out all the passes.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct InferFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ InferFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeInferFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return inferAllPrototypeAttributes(M, TLI);
+ }
+};
+}
+
+char InferFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+
+Pass *llvm::createInferFunctionAttrsLegacyPass() {
+ return new InferFunctionAttrsLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
new file mode 100644
index 000000000000..50e7cc89a3b3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -0,0 +1,116 @@
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into their callers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+namespace {
+
+/// \brief Actual inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public LegacyInlinerBase {
+
+ InlineParams Params;
+
+public:
+ SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit SimpleInliner(InlineParams Params)
+ : LegacyInlinerBase(ID), Params(std::move(Params)) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallSite CS) override {
+ Function *Callee = CS.getCalledFunction();
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
+ /*GetBFI=*/None, PSI);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ TargetTransformInfoWrapperPass *TTIWP;
+
+};
+
+} // end anonymous namespace
+
+char SimpleInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) {
+ return new SimpleInliner(llvm::getInlineParams(Threshold));
+}
+
+Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
+ unsigned SizeOptLevel,
+ bool DisableInlineHotCallSite) {
+ auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+ if (DisableInlineHotCallSite)
+ Param.HotCallSiteThreshold = 0;
+ return new SimpleInliner(Param);
+}
+
+Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
+ return new SimpleInliner(Params);
+}
+
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ return LegacyInlinerBase::runOnSCC(SCC);
+}
+
+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ LegacyInlinerBase::getAnalysisUsage(AU);
+}
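
A hedged usage sketch for the legacy-pass-manager factories above (illustrative only; 275 is an arbitrary example threshold, and the parameterless createFunctionInliningPass() uses the default InlineParams instead):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

static void runSimpleInliner(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createFunctionInliningPass(275)); // SimpleInliner, custom threshold
  PM.run(M);
}
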
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
new file mode 100644
index 000000000000..00ddb93df830
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -0,0 +1,1001 @@
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls and updating the call graph. The decisions of which calls
+// are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+STATISTIC(NumMergedAllocas, "Number of allocas merged together");
+
+// This weirdly named statistic tracks the number of times that, when attempting
+// to inline a function A into B, we analyze the callers of B in order to see
+// whether it would instead be more profitable to inline B into those callers,
+// a step that inlining A into B could block.
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
+
+/// Flag to disable manual alloca merging.
+///
+/// Merging of allocas was originally done as a stack-size saving technique
+/// prior to LLVM's code generator having support for stack coloring based on
+/// lifetime markers. It is now in the process of being removed. To experiment
+/// with disabling it and relying fully on lifetime marker based stack
+/// coloring, you can pass this flag to LLVM.
+static cl::opt<bool>
+ DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
+ cl::init(false), cl::Hidden);
+
+namespace {
+enum class InlinerFunctionImportStatsOpts {
+ No = 0,
+ Basic = 1,
+ Verbose = 2,
+};
+
+cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
+ "inliner-function-import-stats",
+ cl::init(InlinerFunctionImportStatsOpts::No),
+ cl::values(clEnumValN(InlinerFunctionImportStatsOpts::Basic, "basic",
+ "basic statistics"),
+ clEnumValN(InlinerFunctionImportStatsOpts::Verbose, "verbose",
+ "printing of statistics for each inlined function")),
+ cl::Hidden, cl::desc("Enable inliner stats for imported functions"));
+} // namespace
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID)
+ : CallGraphSCCPass(ID), InsertLifetime(true) {}
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
+ : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
+
+/// For this class, we declare that we require and preserve the call graph.
+/// If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+typedef DenseMap<ArrayType *, std::vector<AllocaInst *>> InlinedArrayAllocasTy;
+
+/// Look at all of the allocas that we inlined through this call site. If we
+/// have already inlined other allocas through other calls into this function,
+/// then we know that they have disjoint lifetimes and that we can merge them.
+///
+/// There are many heuristics possible for merging these allocas, and the
+/// different options have different tradeoffs. One thing that we *really*
+/// don't want to hurt is SRoA: once inlining happens, often allocas are no
+/// longer address taken and so they can be promoted.
+///
+/// Our "solution" for that is to only merge allocas whose outermost type is an
+/// array type. These are usually not promoted because someone is using a
+/// variable index into them. These are also often the most important ones to
+/// merge.
+///
+/// A better solution would be to have real memory lifetime markers in the IR
+/// and not have the inliner do any merging of allocas at all. This would
+/// allow the backend to do proper stack slot coloring of all allocas that
+/// *actually make it to the backend*, which is really what we want.
+///
+/// Because we don't have this information, we do this simple and useful hack.
+static void mergeInlinedArrayAllocas(
+ Function *Caller, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory) {
+ SmallPtrSet<AllocaInst *, 16> UsedAllocas;
+
+ // When processing our SCC, check to see if CS was inlined from some other
+ // call site. For example, if we're processing "A" in this code:
+ // A() { B() }
+ // B() { x = alloca ... C() }
+ // C() { y = alloca ... }
+ // Assume that C was not inlined into B initially, and so we're processing A
+ // and decide to inline B into A. Doing this makes an alloca available for
+ // reuse and makes a callsite (C) available for inlining. When we process
+ // the C call site we don't want to do any alloca merging between X and Y
+ // because their scopes are not disjoint. We could make this smarter by
+ // keeping track of the inline history for each alloca in the
+ // InlinedArrayAllocas but this isn't likely to be a significant win.
+ if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
+ return;
+
+ // Loop over all the allocas we have so far and see if they can be merged with
+ // a previously inlined alloca. If not, remember that we had it.
+ for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size(); AllocaNo != e;
+ ++AllocaNo) {
+ AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
+
+ // Don't bother trying to merge array allocations (they will usually be
+ // canonicalized to be an allocation *of* an array), or allocations whose
+ // type is not itself an array (because we're afraid of pessimizing SRoA).
+ ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+ if (!ATy || AI->isArrayAllocation())
+ continue;
+
+ // Get the list of all available allocas for this array type.
+ std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
+
+    // Loop over the allocas in AllocasForType to see if we can reuse one. Note
+    // that we have to be careful not to reuse the same "available" alloca for
+    // multiple different allocas that we just inlined; we use the 'UsedAllocas'
+    // set to keep track of which "available" allocas are being used by this
+    // function. Also, AllocasForType can of course be empty!
+ bool MergedAwayAlloca = false;
+ for (AllocaInst *AvailableAlloca : AllocasForType) {
+
+ unsigned Align1 = AI->getAlignment(),
+ Align2 = AvailableAlloca->getAlignment();
+
+ // The available alloca has to be in the right function, not in some other
+ // function in this SCC.
+ if (AvailableAlloca->getParent() != AI->getParent())
+ continue;
+
+ // If the inlined function already uses this alloca then we can't reuse
+ // it.
+ if (!UsedAllocas.insert(AvailableAlloca).second)
+ continue;
+
+ // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
+ // success!
+ DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+
+ // Move affected dbg.declare calls immediately after the new alloca to
+ // avoid the situation when a dbg.declare precedes its alloca.
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDI->moveBefore(AvailableAlloca->getNextNode());
+
+ AI->replaceAllUsesWith(AvailableAlloca);
+
+ if (Align1 != Align2) {
+ if (!Align1 || !Align2) {
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType());
+
+ Align1 = Align1 ? Align1 : TypeAlign;
+ Align2 = Align2 ? Align2 : TypeAlign;
+ }
+
+ if (Align1 > Align2)
+ AvailableAlloca->setAlignment(AI->getAlignment());
+ }
+
+ AI->eraseFromParent();
+ MergedAwayAlloca = true;
+ ++NumMergedAllocas;
+ IFI.StaticAllocas[AllocaNo] = nullptr;
+ break;
+ }
+
+ // If we already nuked the alloca, we're done with it.
+ if (MergedAwayAlloca)
+ continue;
+
+ // If we were unable to merge away the alloca either because there are no
+ // allocas of the right type available or because we reused them all
+ // already, remember that this alloca came from an inlined function and mark
+ // it used so we don't reuse it for other allocas from this inline
+ // operation.
+ AllocasForType.push_back(AI);
+ UsedAllocas.insert(AI);
+ }
+}
+
+/// If it is possible to inline the specified call site,
+/// do so and update the CallGraph for this operation.
+///
+/// This function also does some basic book-keeping to update the IR. The
+/// InlinedArrayAllocas map keeps track of any allocas that are already
+/// available from other functions inlined into the caller. If we are able to
+/// inline this call site we attempt to reuse already available allocas or add
+/// any new allocas to the set if not possible.
+static bool InlineCallIfPossible(
+ CallSite CS, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
+ bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
+
+ AAResults &AAR = AARGetter(*Callee);
+
+ // Try to inline the function. Get the list of static allocas that were
+ // inlined.
+ if (!InlineFunction(CS, IFI, &AAR, InsertLifetime))
+ return false;
+
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.recordInline(*Caller, *Callee);
+
+ AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
+
+ if (!DisableInlinedAllocaMerging)
+ mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
+
+ return true;
+}
+
+/// Return true if inlining of CS can block the caller from being
+/// inlined which is proved to be more beneficial. \p IC is the
+/// estimated inline cost associated with callsite \p CS.
+/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
+/// caller if \p CS is suppressed for inlining.
+static bool
+shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
+ int &TotalSecondaryCost,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost) {
+
+ // For now we only handle local or inline functions.
+ if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
+ return false;
+ // Try to detect the case where the current inlining candidate caller (call
+ // it B) is a static or linkonce-ODR function and is an inlining candidate
+ // elsewhere, and the current candidate callee (call it C) is large enough
+ // that inlining it into B would make B too big to inline later. In these
+ // circumstances it may be best not to inline C into B, but to inline B into
+ // its callers.
+ //
+ // This only applies to static and linkonce-ODR functions because those are
+ // expected to be available for inlining in the translation units where they
+ // are used. Thus we will always have the opportunity to make local inlining
+ // decisions. Importantly the linkonce-ODR linkage covers inline functions
+ // and templates in C++.
+ //
+ // FIXME: All of this logic should be sunk into getInlineCost. It relies on
+ // the internal implementation of the inline cost metrics rather than
+ // treating them as truly abstract units etc.
+ TotalSecondaryCost = 0;
+ // The candidate cost to be imposed upon the current function.
+ int CandidateCost = IC.getCost() - 1;
+ // This bool tracks what happens if we do NOT inline C into B.
+ bool callerWillBeRemoved = Caller->hasLocalLinkage();
+ // This bool tracks what happens if we DO inline C into B.
+ bool inliningPreventsSomeOuterInline = false;
+ for (User *U : Caller->users()) {
+ CallSite CS2(U);
+
+ // If this isn't a call to Caller (it could be some other sort
+ // of reference) skip it. Such references will prevent the caller
+ // from being removed.
+ if (!CS2 || CS2.getCalledFunction() != Caller) {
+ callerWillBeRemoved = false;
+ continue;
+ }
+
+ InlineCost IC2 = GetInlineCost(CS2);
+ ++NumCallerCallersAnalyzed;
+ if (!IC2) {
+ callerWillBeRemoved = false;
+ continue;
+ }
+ if (IC2.isAlways())
+ continue;
+
+ // See if inlining of the original callsite would erase the cost delta of
+ // this callsite. We subtract off the penalty for the call instruction,
+ // which we would be deleting.
+ if (IC2.getCostDelta() <= CandidateCost) {
+ inliningPreventsSomeOuterInline = true;
+ TotalSecondaryCost += IC2.getCost();
+ }
+ }
+ // If all outer calls to Caller would get inlined, the cost for the last
+ // one is set very low by getInlineCost, in anticipation that Caller will
+ // be removed entirely. We did not account for this above unless there
+ // is only one caller of Caller.
+ if (callerWillBeRemoved && !Caller->hasOneUse())
+ TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
+
+ if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
+ return true;
+
+ return false;
+}
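
To make the arithmetic concrete, a hedged and deliberately simplified trace of the single-outer-caller case (hypothetical numbers; it ignores the last-call-to-static bonus adjustment handled above):

#include <cassert>

// Simplified model of the deferral test for exactly one outer call site.
static bool wouldDefer(int CalleeCost, int OuterCost, int OuterCostDelta) {
  int CandidateCost = CalleeCost - 1;
  bool BlocksOuterInline = OuterCostDelta <= CandidateCost;
  int TotalSecondaryCost = BlocksOuterInline ? OuterCost : 0;
  return BlocksOuterInline && TotalSecondaryCost < CalleeCost;
}

int main() {
  // Cheap outer inline would be blocked by a costlier inner one: defer.
  assert(wouldDefer(/*CalleeCost=*/100, /*OuterCost=*/60, /*OuterCostDelta=*/50));
  // Outer inline costs more than this one saves: do not defer.
  assert(!wouldDefer(/*CalleeCost=*/100, /*OuterCost=*/150, /*OuterCostDelta=*/50));
  return 0;
}
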
+
+/// Return true if the inliner should attempt to inline at the given CallSite.
+static bool shouldInline(CallSite CS,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost,
+ OptimizationRemarkEmitter &ORE) {
+ using namespace ore;
+ InlineCost IC = GetInlineCost(CS);
+ Instruction *Call = CS.getInstruction();
+ Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
+
+ if (IC.isAlways()) {
+ DEBUG(dbgs() << " Inlining: cost=always"
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+ << NV("Callee", Callee)
+ << " should always be inlined (cost=always)");
+ return true;
+ }
+
+ if (IC.isNever()) {
+ DEBUG(dbgs() << " NOT Inlining: cost=never"
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)");
+ return false;
+ }
+
+ if (!IC) {
+ DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
+ << ", thres=" << (IC.getCostDelta() + IC.getCost())
+ << ", Call: " << *CS.getInstruction() << "\n");
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return false;
+ }
+
+ int TotalSecondaryCost = 0;
+ if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)) {
+ DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+ Call)
+ << "Not inlining. Cost of inlining " << NV("Callee", Callee)
+ << " increases the cost of inlining " << NV("Caller", Caller)
+ << " in other contexts");
+ return false;
+ }
+
+ DEBUG(dbgs() << " Inlining: cost=" << IC.getCost()
+ << ", thres=" << (IC.getCostDelta() + IC.getCost())
+ << ", Call: " << *CS.getInstruction() << '\n');
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBeInlined", Call)
+ << NV("Callee", Callee) << " can be inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return true;
+}
+
+/// Return true if the specified inline history ID
+/// indicates an inline history that includes the specified function.
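+/// For example, if entry 0 of the history is {F, -1} (call sites introduced by
+/// inlining F) and entry 1 is {G, 0} (introduced by inlining a G that was
+/// itself exposed by inlining F), then InlineHistoryIncludes(F, 1, ...) walks
+/// the chain 1 -> 0 -> -1 and returns true at entry 0.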
+static bool InlineHistoryIncludes(
+ Function *F, int InlineHistoryID,
+ const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
+ while (InlineHistoryID != -1) {
+ assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
+ "Invalid inline history ID");
+ if (InlineHistory[InlineHistoryID].first == F)
+ return true;
+ InlineHistoryID = InlineHistory[InlineHistoryID].second;
+ }
+ return false;
+}
+
+bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.setModuleInfo(CG.getModule());
+ return false; // No changes to CallGraph.
+}
+
+bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return inlineCalls(SCC);
+}
+
+static bool
+inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI,
+ bool InsertLifetime,
+ function_ref<InlineCost(CallSite CS)> GetInlineCost,
+ function_ref<AAResults &(Function &)> AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ SmallPtrSet<Function *, 8> SCCFunctions;
+ DEBUG(dbgs() << "Inliner visiting SCC:");
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (F)
+ SCCFunctions.insert(F);
+ DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+ }
+
+ // Scan through and identify all call sites ahead of time so that we only
+ // inline call sites in the original functions, not call sites that result
+ // from inlining other functions.
+ SmallVector<std::pair<CallSite, int>, 16> CallSites;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 8> InlineHistory;
+
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ OptimizationRemarkEmitter ORE(F);
+ for (BasicBlock &BB : *F)
+ for (Instruction &I : BB) {
+ CallSite CS(cast<Value>(&I));
+ // If this isn't a call, or it is a call to an intrinsic, it can
+ // never be inlined.
+ if (!CS || isa<IntrinsicInst>(I))
+ continue;
+
+ // If this is a direct call to an external function, we can never inline
+ // it. If it is an indirect call, inlining may resolve it to be a
+ // direct call, so we keep it.
+ if (Function *Callee = CS.getCalledFunction())
+ if (Callee->isDeclaration()) {
+ using namespace ore;
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CS.getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose());
+ continue;
+ }
+
+ CallSites.push_back(std::make_pair(CS, -1));
+ }
+ }
+
+ DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+
+ // If there are no calls in this function, exit early.
+ if (CallSites.empty())
+ return false;
+
+ // Now that we have all of the call sites, move the ones to functions in the
+ // current SCC to the end of the list.
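+ // For example, call sites [A, B, C] where only B targets a function in the
+ // current SCC end up as [A, C, B] with FirstCallInSCC equal to 2.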
+ unsigned FirstCallInSCC = CallSites.size();
+ for (unsigned i = 0; i < FirstCallInSCC; ++i)
+ if (Function *F = CallSites[i].first.getCalledFunction())
+ if (SCCFunctions.count(F))
+ std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
+
+ InlinedArrayAllocasTy InlinedArrayAllocas;
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
+
+ // Now that we have all of the call sites, loop over them and inline them if
+ // it looks profitable to do so.
+ bool Changed = false;
+ bool LocalChange;
+ do {
+ LocalChange = false;
+ // Iterate over the outer loop because inlining functions can cause indirect
+ // calls to become direct calls.
+ // CallSites may be modified inside, so a range-based for loop cannot be used.
+ for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+ CallSite CS = CallSites[CSi].first;
+
+ Function *Caller = CS.getCaller();
+ Function *Callee = CS.getCalledFunction();
+
+ // We can only inline direct calls to non-declarations.
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ Instruction *Instr = CS.getInstruction();
+
+ bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI);
+
+ int InlineHistoryID;
+ if (!IsTriviallyDead) {
+ // If this call site was obtained by inlining another function, verify
+ // that the inline history for the call site does not include the callee
+ // itself. If it did, we would be recursively inlining the same function,
+ // which would produce the same call sites and cause us to inline
+ // infinitely.
+ InlineHistoryID = CallSites[CSi].second;
+ if (InlineHistoryID != -1 &&
+ InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
+ continue;
+ }
+
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
+
+ // If the policy determines that we should not inline this call site,
+ // don't bother looking at it any further.
+ if (!shouldInline(CS, GetInlineCost, ORE))
+ continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (IsTriviallyDead) {
+ DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ CG[Caller]->removeCallEdgeFor(CS);
+ Instr->eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
+ // Get DebugLoc to report. CS will be invalid after Inliner.
+ DebugLoc DLoc = Instr->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+
+ // Attempt to inline the function.
+ using namespace ore;
+ if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
+ InlineHistoryID, InsertLifetime, AARGetter,
+ ImportedFunctionsStats)) {
+ ORE.emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller));
+ continue;
+ }
+ ++NumInlined;
+
+ // Report the inline decision.
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block)
+ << NV("Callee", Callee) << " inlined into "
+ << NV("Caller", Caller));
+
+ // If inlining this function gave us any new call sites, throw them
+ // onto our worklist to process. They are useful inline candidates.
+ if (!InlineInfo.InlinedCalls.empty()) {
+ // Create a new inline history entry for this, so that we remember
+ // that these new callsites came about due to inlining Callee.
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
+
+ for (Value *Ptr : InlineInfo.InlinedCalls)
+ CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
+ }
+ }
+
+ // If we inlined or deleted the last possible call site to the function,
+ // delete the function body now.
+ if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
+ // TODO: Can remove if in SCC now.
+ !SCCFunctions.count(Callee) &&
+
+ // The function may be apparently dead, but if there are indirect
+ // callgraph references to the node, we cannot delete it yet; this
+ // could invalidate the CGSCC iterator.
+ CG[Callee]->getNumReferences() == 0) {
+ DEBUG(dbgs() << " -> Deleting dead function: " << Callee->getName()
+ << "\n");
+ CallGraphNode *CalleeNode = CG[Callee];
+
+ // Remove any call graph edges from the callee to its callees.
+ CalleeNode->removeAllCalledFunctions();
+
+ // Remove the node for the callee from the call graph and delete it.
+ delete CG.removeFunctionFromModule(CalleeNode);
+ ++NumDeleted;
+ }
+
+ // Remove this call site from the list. If possible, use
+ // swap/pop_back for efficiency, but do not use it if doing so would
+ // move a call site to a function in this SCC before the
+ // 'FirstCallInSCC' barrier.
+ if (SCC.isSingular()) {
+ CallSites[CSi] = CallSites.back();
+ CallSites.pop_back();
+ } else {
+ CallSites.erase(CallSites.begin() + CSi);
+ }
+ --CSi;
+
+ Changed = true;
+ LocalChange = true;
+ }
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime,
+ [this](CallSite CS) { return getInlineCost(CS); },
+ LegacyAARGetter(*this), ImportedFunctionsStats);
+}
+
+/// Remove now-dead linkonce functions at the end of
+/// processing to avoid breaking the SCC traversal.
+bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
+ return removeDeadFunctions(CG);
+}
+
+/// Remove dead functions that are not in the DNR (Do Not Remove) list.
+bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
+ bool AlwaysInlineOnly) {
+ SmallVector<CallGraphNode *, 16> FunctionsToRemove;
+ SmallVector<Function *, 16> DeadFunctionsInComdats;
+
+ auto RemoveCGN = [&](CallGraphNode *CGN) {
+ // Remove any call graph edges from the function to its callees.
+ CGN->removeAllCalledFunctions();
+
+ // Remove any edges from the external node to the function's call graph
+ // node. These edges might have been made irrelevant due to
+ // optimization of the program.
+ CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+ // Remove the node for the callee from the call graph and delete it.
+ FunctionsToRemove.push_back(CGN);
+ };
+
+ // Scan for all of the functions, looking for ones that should now be removed
+ // from the program. Insert the dead ones in the FunctionsToRemove set.
+ for (const auto &I : CG) {
+ CallGraphNode *CGN = I.second.get();
+ Function *F = CGN->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ // Handle the case when this function is called and we only want to care
+ // about always-inline functions. This is a bit of a hack to share code
+ // between here and the InlineAlways pass.
+ if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
+ continue;
+
+ // If the only remaining users of the function are dead constants, remove
+ // them.
+ F->removeDeadConstantUsers();
+
+ if (!F->isDefTriviallyDead())
+ continue;
+
+ // It is unsafe to drop a function with discardable linkage from a COMDAT
+ // without also dropping the other members of the COMDAT.
+ // The inliner doesn't visit non-function entities which are in COMDAT
+ // groups so it is unsafe to do so *unless* the linkage is local.
+ if (!F->hasLocalLinkage()) {
+ if (F->hasComdat()) {
+ DeadFunctionsInComdats.push_back(F);
+ continue;
+ }
+ }
+
+ RemoveCGN(CGN);
+ }
+ if (!DeadFunctionsInComdats.empty()) {
+ // Filter out the functions whose comdats remain alive.
+ filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ // Remove the rest.
+ for (Function *F : DeadFunctionsInComdats)
+ RemoveCGN(CG[F]);
+ }
+
+ if (FunctionsToRemove.empty())
+ return false;
+
+ // Now that we know which functions to delete, do so. We didn't want to do
+ // this inline, because that would invalidate our CallGraph::iterator
+ // objects. :(
+ //
+ // Note that it doesn't matter that we iterate in a non-stable order here:
+ // the functions can be deleted in any order.
+ array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
+ FunctionsToRemove.erase(
+ std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
+ FunctionsToRemove.end());
+ for (CallGraphNode *CGN : FunctionsToRemove) {
+ delete CG.removeFunctionFromModule(CGN);
+ ++NumDeleted;
+ }
+ return true;
+}
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+ CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
+ bool Changed = false;
+
+ assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+ Module &M = *InitialC.begin()->getFunction().getParent();
+ ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ // We use a single common worklist for calls across the entire SCC. We
+ // process these in-order and append new calls introduced during inlining to
+ // the end.
+ //
+ // Note that this particular order of processing is actually critical to
+ // avoid very bad behaviors. Consider *highly connected* call graphs where
+ // each function contains a small amount of code and a couple of calls to
+ // other functions. Because the LLVM inliner is fundamentally a bottom-up
+ // inliner, it can handle gracefully the fact that these all appear to be
+ // reasonable inlining candidates as it will flatten things until they become
+ // too big to inline, and then move on and flatten another batch.
+ //
+ // However, when processing call edges *within* an SCC we cannot rely on this
+ // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+ // functions we can end up incrementally inlining N calls into each of
+ // N functions because each incremental inlining decision looks good and we
+ // don't have a topological ordering to prevent explosions.
+ //
+ // To compensate for this, we don't process transitive edges made immediate
+ // by inlining until we've done one pass of inlining across the entire SCC.
+ // Large, highly connected SCCs still lead to some amount of code bloat in
+ // this model, but it is uniformly spread across all the functions in the SCC
+ // and eventually they all become too large to inline, rather than
+ // incrementally making a single function grow in a superlinear fashion.
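+ //
+ // For example, in an SCC of three mutually recursive functions f, g and h,
+ // the first sweep may inline g into f, h into g and f into h; the call sites
+ // those inlines expose are appended to the worklist and are only revisited
+ // once every original call in the SCC has been processed.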
+ SmallVector<std::pair<CallSite, int>, 16> Calls;
+
+ // Populate the initial list of calls in this SCC.
+ for (auto &N : InitialC) {
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using the instruction sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(N.getFunction()))
+ if (auto CS = CallSite(&I))
+ if (Function *Callee = CS.getCalledFunction())
+ if (!Callee->isDeclaration())
+ Calls.push_back({CS, -1});
+ }
+ if (Calls.empty())
+ return PreservedAnalyses::all();
+
+ // Capture updatable variables for the current SCC and RefSCC.
+ auto *C = &InitialC;
+ auto *RC = &C->getOuterRefSCC();
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+ // Track a set vector of inlined callees so that we can augment the caller
+ // with all of their edges in the call graph before pruning out the ones that
+ // got simplified away.
+ SmallSetVector<Function *, 4> InlinedCallees;
+
+ // Track the dead functions to delete once finished with inlining calls. We
+ // defer deleting these to make it easier to handle the call graph updates.
+ SmallVector<Function *, 4> DeadFunctions;
+
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int i = 0; i < (int)Calls.size(); ++i) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls[i].first.getCaller();
+ LazyCallGraph::Node &N = *CG.lookup(F);
+ if (CG.lookupSCC(N) != C)
+ continue;
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ continue;
+
+ DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+
+ // Get a FunctionAnalysisManager via a proxy for this particular node. We do
+ // this each time we visit a node as the SCC may have changed, and because
+ // we're going to mutate this particular function we want to make sure the
+ // proxy is in place to forward any invalidation events. However, we can use
+ // the manager we get here to look up results for functions other than this
+ // node, because those functions aren't going to be mutated by this pass.
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
+ .getManager();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto GetInlineCost = [&](CallSite CS) {
+ Function &Callee = *CS.getCalledFunction();
+ auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+ return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
+ PSI);
+ };
+
+ // Get the remarks emission analysis for the caller.
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can update the
+ // call graph and prepare the context of that new caller.
+ bool DidInline = false;
+ for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
+ int InlineHistoryID;
+ CallSite CS;
+ std::tie(CS, InlineHistoryID) = Calls[i];
+ Function &Callee = *CS.getCalledFunction();
+
+ if (InlineHistoryID != -1 &&
+ InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
+ continue;
+
+ // Check whether we want to inline this callsite.
+ if (!shouldInline(CS, GetInlineCost, ORE))
+ continue;
+
+ // Set up the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
+ if (!InlineFunction(CS, IFI))
+ continue;
+ DidInline = true;
+ InlinedCallees.insert(&Callee);
+
+ // Add any new callsites to defined functions to the worklist.
+ if (!IFI.InlinedCallSites.empty()) {
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back({&Callee, InlineHistoryID});
+ for (CallSite &CS : reverse(IFI.InlinedCallSites))
+ if (Function *NewCallee = CS.getCalledFunction())
+ if (!NewCallee->isDeclaration())
+ Calls.push_back({CS, NewHistoryID});
+ }
+
+ // Merge the attributes based on the inlining.
+ AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+ // For local functions, check whether this makes the callee trivially
+ // dead. In that case, we can drop the body of the function eagerly
+ // which may reduce the number of callers of other functions to one,
+ // changing inline cost thresholds.
+ if (Callee.hasLocalLinkage()) {
+ // To check this we also need to nuke any dead constant uses (perhaps
+ // made dead by this operation on other functions).
+ Callee.removeDeadConstantUsers();
+ if (Callee.use_empty()) {
+ Calls.erase(
+ std::remove_if(Calls.begin() + i + 1, Calls.end(),
+ [&Callee](const std::pair<CallSite, int> &Call) {
+ return Call.first.getCaller() == &Callee;
+ }),
+ Calls.end());
+ // Clear the body and queue the function itself for deletion when we
+ // finish inlining and call graph updates.
+ // Note that after this point, it is an error to do anything other
+ // than use the callee's address or delete it.
+ Callee.dropAllReferences();
+ assert(find(DeadFunctions, &Callee) == DeadFunctions.end() &&
+ "Cannot cause a function to become dead twice!");
+ DeadFunctions.push_back(&Callee);
+ }
+ }
+ }
+
+ // Back the call index up by one to put us in a good position to go around
+ // the outer loop.
+ --i;
+
+ if (!DidInline)
+ continue;
+ Changed = true;
+
+ // Add all the inlined callees' edges as ref edges to the caller. These are
+ // by definition trivial edges as we always have *some* transitive ref edge
+ // chain. While in some cases these edges are direct calls inside the
+ // callee, they have to be modeled in the inliner as reference edges as
+ // there may be a reference edge anywhere along the chain from the current
+ // caller to the callee that causes the whole thing to appear like
+ // a (transitive) reference edge that will require promotion to a call edge
+ // below.
+ for (Function *InlinedCallee : InlinedCallees) {
+ LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
+ for (LazyCallGraph::Edge &E : *CalleeN)
+ RC->insertTrivialRefEdge(N, E.getNode());
+ }
+ InlinedCallees.clear();
+
+ // At this point, since we have made changes we have at least removed
+ // a call instruction. However, in the process we do some incremental
+ // simplification of the surrounding code. This simplification can
+ // essentially do all of the same things as a function pass and we can
+ // re-use the exact same logic for updating the call graph to reflect the
+ // change.
+ C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+ DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
+ RC = &C->getOuterRefSCC();
+ }
+
+ // Now that we've finished inlining all of the calls across this SCC, delete
+ // all of the trivially dead functions, updating the call graph and the CGSCC
+ // pass manager in the process.
+ //
+ // Note that this walks a pointer set which has non-deterministic order but
+ // that is OK as all we do is delete things and add pointers to unordered
+ // sets.
+ for (Function *DeadF : DeadFunctions) {
+ // Get the necessary information out of the call graph and nuke the
+ // function there. Also, clear out any cached analyses.
+ auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG)
+ .getManager();
+ FAM.clear(*DeadF);
+ AM.clear(DeadC);
+ auto &DeadRC = DeadC.getOuterRefSCC();
+ CG.removeDeadFunction(*DeadF);
+
+ // Mark the relevant parts of the call graph as invalid so we don't visit
+ // them.
+ UR.InvalidatedSCCs.insert(&DeadC);
+ UR.InvalidatedRefSCCs.insert(&DeadRC);
+
+ // And delete the actual function from the module.
+ M.getFunctionList().erase(DeadF);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // Even if we change the IR, we update the core CGSCC data structures and so
+ // can preserve the proxy to the function analysis manager.
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
new file mode 100644
index 000000000000..26db1465bb26
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -0,0 +1,294 @@
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions and variables in the input module.
+// If the function or variable does not need to be preserved according to the
+// client supplied callback, it is marked as internal.
+//
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example, internalizing a function with external linkage is only safe if
+// we are told that the function is only used from within this module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "internalize"
+
+STATISTIC(NumAliases, "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals, "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be marked
+// external.
+static cl::opt<std::string>
+ APIFile("internalize-public-api-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+ APIList("internalize-public-api-list", cl::value_desc("list"),
+ cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
+
+namespace {
+// Helper to load an API list to preserve from file and expose it as a functor
+// for internalization.
+class PreserveAPIList {
+public:
+ PreserveAPIList() {
+ if (!APIFile.empty())
+ LoadFile(APIFile);
+ ExternalNames.insert(APIList.begin(), APIList.end());
+ }
+
+ bool operator()(const GlobalValue &GV) {
+ return ExternalNames.count(GV.getName());
+ }
+
+private:
+ // Contains the set of symbols loaded from file
+ StringSet<> ExternalNames;
+
+ void LoadFile(StringRef Filename) {
+ // Load the APIFile...
+ std::ifstream In(Filename.data());
+ if (!In.good()) {
+ errs() << "WARNING: Internalize couldn't load file '" << Filename
+ << "'! Continuing as if it's empty.\n";
+ return; // Just continue as if the file were empty
+ }
+ while (In) {
+ std::string Symbol;
+ In >> Symbol;
+ if (!Symbol.empty())
+ ExternalNames.insert(Symbol);
+ }
+ }
+};
+} // end anonymous namespace
+
+bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
+ // Function must be defined here
+ if (GV.isDeclaration())
+ return true;
+
+ // Available externally is really just a "declaration with a body".
+ if (GV.hasAvailableExternallyLinkage())
+ return true;
+
+ // Assume that dllexported symbols are referenced elsewhere
+ if (GV.hasDLLExportStorageClass())
+ return true;
+
+ // Already local, has nothing to do.
+ if (GV.hasLocalLinkage())
+ return false;
+
+ // Check some special cases
+ if (AlwaysPreserved.count(GV.getName()))
+ return true;
+
+ return MustPreserveGV(GV);
+}
+
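+// Try to internalize GV, returning true on success. For example, an externally
+// visible function in a comdat none of whose members must be preserved (so the
+// comdat is not in ExternalComdats) has its comdat stripped and is given
+// internal linkage, while any member of an externally visible comdat is left
+// untouched.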
+bool InternalizePass::maybeInternalize(
+ GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) {
+ if (Comdat *C = GV.getComdat()) {
+ if (ExternalComdats.count(C))
+ return false;
+
+ // If a comdat is not externally visible we can drop it.
+ if (auto GO = dyn_cast<GlobalObject>(&GV))
+ GO->setComdat(nullptr);
+
+ if (GV.hasLocalLinkage())
+ return false;
+ } else {
+ if (GV.hasLocalLinkage())
+ return false;
+
+ if (shouldPreserveGV(GV))
+ return false;
+ }
+
+ GV.setVisibility(GlobalValue::DefaultVisibility);
+ GV.setLinkage(GlobalValue::InternalLinkage);
+ return true;
+}
+
+// If GV is part of a comdat and is externally visible, keep track of its
+// comdat so that we don't internalize any of its members.
+void InternalizePass::checkComdatVisibility(
+ GlobalValue &GV, std::set<const Comdat *> &ExternalComdats) {
+ Comdat *C = GV.getComdat();
+ if (!C)
+ return;
+
+ if (shouldPreserveGV(GV))
+ ExternalComdats.insert(C);
+}
+
+bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
+ bool Changed = false;
+ CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
+
+ SmallPtrSet<GlobalValue *, 8> Used;
+ collectUsedGlobalVariables(M, Used, false);
+
+ // Collect comdat visibility information for the module.
+ std::set<const Comdat *> ExternalComdats;
+ if (!M.getComdatSymbolTable().empty()) {
+ for (Function &F : M)
+ checkComdatVisibility(F, ExternalComdats);
+ for (GlobalVariable &GV : M.globals())
+ checkComdatVisibility(GV, ExternalComdats);
+ for (GlobalAlias &GA : M.aliases())
+ checkComdatVisibility(GA, ExternalComdats);
+ }
+
+ // We must assume that globals in llvm.used have a reference that not even
+ // the linker can see, so we don't internalize them.
+ // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
+ // linker can drop those symbols. If this pass is running as part of LTO,
+ // one might think that it could just drop llvm.compiler.used. The problem
+ // is that even in LTO llvm doesn't see every reference. For example,
+ // we don't see references from function local inline assembly. To be
+ // conservative, we internalize symbols in llvm.compiler.used, but we
+ // keep llvm.compiler.used so that the symbol is not deleted by llvm.
+ for (GlobalValue *V : Used) {
+ AlwaysPreserved.insert(V->getName());
+ }
+
+ // Mark all functions not in the api as internal.
+ for (Function &I : M) {
+ if (!maybeInternalize(I, ExternalComdats))
+ continue;
+ Changed = true;
+
+ if (ExternalNode)
+ // Remove a callgraph edge from the external node to this function.
+ ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
+
+ ++NumFunctions;
+ DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ }
+
+ // Never internalize the llvm.used symbol. It is used to implement
+ // attribute((used)).
+ // FIXME: Shouldn't this just filter on llvm.metadata section??
+ AlwaysPreserved.insert("llvm.used");
+ AlwaysPreserved.insert("llvm.compiler.used");
+
+ // Never internalize anchors used by the machine module info, else the info
+ // won't find them. (see MachineModuleInfo.)
+ AlwaysPreserved.insert("llvm.global_ctors");
+ AlwaysPreserved.insert("llvm.global_dtors");
+ AlwaysPreserved.insert("llvm.global.annotations");
+
+ // Never internalize symbols code-gen inserts.
+ // FIXME: We should probably add this (and the __stack_chk_guard) via some
+ // type of call-back in CodeGen.
+ AlwaysPreserved.insert("__stack_chk_fail");
+ AlwaysPreserved.insert("__stack_chk_guard");
+
+ // Mark all global variables with initializers that are not in the api as
+ // internal as well.
+ for (auto &GV : M.globals()) {
+ if (!maybeInternalize(GV, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumGlobals;
+ DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
+ }
+
+ // Mark all aliases that are not in the api as internal as well.
+ for (auto &GA : M.aliases()) {
+ if (!maybeInternalize(GA, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumAliases;
+ DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
+ }
+
+ return Changed;
+}
+
+InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
+
+PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
+
+namespace {
+class InternalizeLegacyPass : public ModulePass {
+ // Client supplied callback to control whether a symbol must be preserved.
+ std::function<bool(const GlobalValue &)> MustPreserveGV;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
+
+ InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
+ : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
+ initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ CallGraphWrapperPass *CGPass =
+ getAnalysisIfAvailable<CallGraphWrapperPass>();
+ CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
+ return internalizeModule(M, MustPreserveGV, CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+}
+
+char InternalizeLegacyPass::ID = 0;
+INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
+ "Internalize Global Symbols", false, false)
+
+ModulePass *llvm::createInternalizePass() {
+ return new InternalizeLegacyPass();
+}
+
+ModulePass *llvm::createInternalizePass(
+ std::function<bool(const GlobalValue &)> MustPreserveGV) {
+ return new InternalizeLegacyPass(std::move(MustPreserveGV));
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
new file mode 100644
index 000000000000..c74b0a35e296
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -0,0 +1,311 @@
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This is a pass most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-extract"
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
+ struct LoopExtractor : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ unsigned NumLoops;
+
+ explicit LoopExtractor(unsigned numLoops = ~0)
+ : LoopPass(ID), NumLoops(numLoops) {
+ initializeLoopExtractorPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+ };
+}
+
+char LoopExtractor::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
+ "Extract loops into new functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
+ "Extract loops into new functions", false, false)
+
+namespace {
+ /// SingleLoopExtractor - For bugpoint.
+ struct SingleLoopExtractor : public LoopExtractor {
+ static char ID; // Pass identification, replacement for typeid
+ SingleLoopExtractor() : LoopExtractor(1) {}
+ };
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
+ "Extract at most one loop into a new function", false, false)
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
+Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
+
+bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+
+ // Only visit top-level loops.
+ if (L->getParentLoop())
+ return false;
+
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ bool Changed = false;
+
+ // If there is more than one top-level loop in this function, extract all of
+ // the loops. Otherwise there is exactly one top-level loop; in this case if
+ // this function is more than a minimal wrapper around the loop, extract
+ // the loop.
+ bool ShouldExtractLoop = false;
+
+ // Extract the loop if the entry block doesn't branch to the loop header.
+ TerminatorInst *EntryTI =
+ L->getHeader()->getParent()->getEntryBlock().getTerminator();
+ if (!isa<BranchInst>(EntryTI) ||
+ !cast<BranchInst>(EntryTI)->isUnconditional() ||
+ EntryTI->getSuccessor(0) != L->getHeader()) {
+ ShouldExtractLoop = true;
+ } else {
+ // Check to see if any exits from the loop are more than just return
+ // blocks.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
+ ShouldExtractLoop = true;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop) {
+ // We must omit EH pads, since they have to accompany their invoke
+ // instruction. Extracting them anyway would create a loop in the extracted
+ // function, and the extractor would then cycle forever trying to extract
+ // that loop as well.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (ExitBlocks[i]->isEHPad()) {
+ ShouldExtractLoop = false;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop) {
+ if (NumLoops == 0) return Changed;
+ --NumLoops;
+ CodeExtractor Extractor(DT, *L);
+ if (Extractor.extractCodeRegion() != nullptr) {
+ Changed = true;
+ // After extraction, the loop is replaced by a function call, so
+ // we shouldn't try to run any more loop passes on it.
+ LI.markAsRemoved(L);
+ }
+ ++NumExtracted;
+ }
+
+ return Changed;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can. This is used by bugpoint.
+//
+Pass *llvm::createSingleLoopExtractorPass() {
+ return new SingleLoopExtractor();
+}
+
+
+// BlockFile - A file which contains a list of blocks that should not be
+// extracted.
+static cl::opt<std::string>
+BlockFile("extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to not extract"),
+ cl::Hidden);
+
+namespace {
+ /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
+ /// from the module into their own functions except for those specified by the
+ /// BlocksToNotExtract list.
+ class BlockExtractorPass : public ModulePass {
+ void LoadFile(const char *Filename);
+ void SplitLandingPadPreds(Function *F);
+
+ std::vector<BasicBlock*> BlocksToNotExtract;
+ std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ BlockExtractorPass() : ModulePass(ID) {
+ if (!BlockFile.empty())
+ LoadFile(BlockFile.c_str());
+ }
+
+ bool runOnModule(Module &M) override;
+ };
+}
+
+char BlockExtractorPass::ID = 0;
+INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
+ "Extract Basic Blocks From Module (for bugpoint use)",
+ false, false)
+
+// createBlockExtractorPass - This pass extracts all blocks (except those
+// specified in the argument list) from the functions in the module.
+//
+ModulePass *llvm::createBlockExtractorPass() {
+ return new BlockExtractorPass();
+}
+
+void BlockExtractorPass::LoadFile(const char *Filename) {
+ // Load the BlockFile...
+ std::ifstream In(Filename);
+ if (!In.good()) {
+ errs() << "WARNING: BlockExtractor couldn't load file '" << Filename
+ << "'!\n";
+ return;
+ }
+ while (In) {
+ std::string FunctionName, BlockName;
+ In >> FunctionName;
+ In >> BlockName;
+ if (!BlockName.empty())
+ BlocksToNotExtractByName.push_back(
+ std::make_pair(FunctionName, BlockName));
+ }
+}
+
+/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
+/// instruction. The critical edge breaker will refuse to break critical edges
+/// to a landing pad. So split them here. After this method runs, all landing pads
+/// should have only one predecessor.
+void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ InvokeInst *II = dyn_cast<InvokeInst>(I);
+ if (!II) continue;
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (pred_iterator
+ PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
+ BasicBlock *BB = *PI;
+ if (BB->isLandingPad() && BB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split) continue;
+
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
+ }
+}
+
+bool BlockExtractorPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ std::set<BasicBlock*> TranslatedBlocksToNotExtract;
+ for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
+ BasicBlock *BB = BlocksToNotExtract[i];
+ Function *F = BB->getParent();
+
+ // Map the corresponding function in this module.
+ Function *MF = M.getFunction(F->getName());
+ assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
+
+ // Figure out which index the basic block is in its function.
+ Function::iterator BBI = MF->begin();
+ std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
+ TranslatedBlocksToNotExtract.insert(&*BBI);
+ }
+
+ while (!BlocksToNotExtractByName.empty()) {
+ // There's no way to find BBs by name without looking at every BB inside
+ // every Function. Fortunately, this is always empty except when used by
+ // bugpoint in which case correctness is more important than performance.
+
+ std::string &FuncName = BlocksToNotExtractByName.back().first;
+ std::string &BlockName = BlocksToNotExtractByName.back().second;
+
+ for (Function &F : M) {
+ if (F.getName() != FuncName) continue;
+
+ for (BasicBlock &BB : F) {
+ if (BB.getName() != BlockName) continue;
+
+ TranslatedBlocksToNotExtract.insert(&BB);
+ }
+ }
+
+ BlocksToNotExtractByName.pop_back();
+ }
+
+ // Now that we know which blocks to not extract, figure out which ones we WANT
+ // to extract.
+ std::vector<BasicBlock*> BlocksToExtract;
+ for (Function &F : M) {
+ SplitLandingPadPreds(&F);
+ for (BasicBlock &BB : F)
+ if (!TranslatedBlocksToNotExtract.count(&BB))
+ BlocksToExtract.push_back(&BB);
+ }
+
+ for (BasicBlock *BlockToExtract : BlocksToExtract) {
+ SmallVector<BasicBlock*, 2> BlocksToExtractVec;
+ BlocksToExtractVec.push_back(BlockToExtract);
+ if (const InvokeInst *II =
+ dyn_cast<InvokeInst>(BlockToExtract->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ CodeExtractor(BlocksToExtractVec).extractCodeRegion();
+ }
+
+ return !BlocksToExtract.empty();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
new file mode 100644
index 000000000000..693df5e7ba92
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -0,0 +1,1673 @@
+//===-- LowerTypeTests.cpp - type metadata lowering pass ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// See http://llvm.org/docs/TypeMetadata.html for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+using namespace lowertypetests;
+
+#define DEBUG_TYPE "lowertypetests"
+
+STATISTIC(ByteArraySizeBits, "Byte array size in bits");
+STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
+STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
+STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
+STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
+
+static cl::opt<bool> AvoidReuse(
+ "lowertypetests-avoid-reuse",
+ cl::desc("Try to avoid reuse of byte array addresses using aliases"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "lowertypetests-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "lowertypetests-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "lowertypetests-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
+bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
+ if (Offset < ByteOffset)
+ return false;
+
+ if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
+ return false;
+
+ uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
+ if (BitOffset >= BitSize)
+ return false;
+
+ return Bits.count(BitOffset);
+}
+
+void BitSetInfo::print(raw_ostream &OS) const {
+ OS << "offset " << ByteOffset << " size " << BitSize << " align "
+ << (1 << AlignLog2);
+
+ if (isAllOnes()) {
+ OS << " all-ones\n";
+ return;
+ }
+
+ OS << " { ";
+ for (uint64_t B : Bits)
+ OS << B << ' ';
+ OS << "}\n";
+}
+
+BitSetInfo BitSetBuilder::build() {
+ if (Min > Max)
+ Min = 0;
+
+ // Normalize each offset against the minimum observed offset, and compute
+ // the bitwise OR of each of the offsets. The number of trailing zeros
+ // in the mask gives us the log2 of the alignment of all offsets, which
+ // allows us to compress the bitset by only storing one bit per aligned
+ // address.
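+ //
+ // For example, offsets {8, 16, 24} give Min = 8 and normalize to {0, 8, 16};
+ // the mask is then 0b11000, so AlignLog2 is 3, BitSize becomes
+ // ((24 - 8) >> 3) + 1 = 3 and the stored bits are {0, 1, 2}.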
+ uint64_t Mask = 0;
+ for (uint64_t &Offset : Offsets) {
+ Offset -= Min;
+ Mask |= Offset;
+ }
+
+ BitSetInfo BSI;
+ BSI.ByteOffset = Min;
+
+ BSI.AlignLog2 = 0;
+ if (Mask != 0)
+ BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
+
+ // Build the compressed bitset while normalizing the offsets against the
+ // computed alignment.
+ BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
+ for (uint64_t Offset : Offsets) {
+ Offset >>= BSI.AlignLog2;
+ BSI.Bits.insert(Offset);
+ }
+
+ return BSI;
+}
+
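+// Lay out the object indices in F as a new fragment, folding in any existing
+// fragments that share members with F. For example, if F = {3, 5} and index 5
+// already lives in fragment 2 = [5, 7], the new fragment becomes [3, 5, 7],
+// fragment 2 is emptied, and the fragment map entries for 3, 5 and 7 all point
+// at the new fragment.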
+void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
+ // Create a new fragment to hold the layout for F.
+ Fragments.emplace_back();
+ std::vector<uint64_t> &Fragment = Fragments.back();
+ uint64_t FragmentIndex = Fragments.size() - 1;
+
+ for (auto ObjIndex : F) {
+ uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
+ if (OldFragmentIndex == 0) {
+ // We haven't seen this object index before, so just add it to the current
+ // fragment.
+ Fragment.push_back(ObjIndex);
+ } else {
+ // This index belongs to an existing fragment. Copy the elements of the
+ // old fragment into this one and clear the old fragment. We don't update
+ // the fragment map just yet; this ensures that any further references to
+ // indices from the old fragment in this fragment do not insert any more
+ // indices.
+ std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
+ Fragment.insert(Fragment.end(), OldFragment.begin(), OldFragment.end());
+ OldFragment.clear();
+ }
+ }
+
+ // Update the fragment map to point our object indices to this fragment.
+ for (uint64_t ObjIndex : Fragment)
+ FragmentMap[ObjIndex] = FragmentIndex;
+}
+
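+// Allocate a bit position and byte range for a bit set with the given Bits and
+// BitSize. For example, if the per-bit-position high-water marks in BitAllocs
+// are {4, 2, 6, 4, 4, 4, 4, 4}, bit position 1 is the least used, so
+// AllocByteOffset becomes 2, AllocMask becomes 1 << 1, and byte 2 + B of the
+// byte array gets that bit set for each B in Bits.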
+void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
+ uint64_t BitSize, uint64_t &AllocByteOffset,
+ uint8_t &AllocMask) {
+ // Find the smallest current allocation.
+ unsigned Bit = 0;
+ for (unsigned I = 1; I != BitsPerByte; ++I)
+ if (BitAllocs[I] < BitAllocs[Bit])
+ Bit = I;
+
+ AllocByteOffset = BitAllocs[Bit];
+
+ // Add our size to it.
+ unsigned ReqSize = AllocByteOffset + BitSize;
+ BitAllocs[Bit] = ReqSize;
+ if (Bytes.size() < ReqSize)
+ Bytes.resize(ReqSize);
+
+ // Set our bits.
+ AllocMask = 1 << Bit;
+ for (uint64_t B : Bits)
+ Bytes[AllocByteOffset + B] |= AllocMask;
+}
+
+namespace {
+
+struct ByteArrayInfo {
+ std::set<uint64_t> Bits;
+ uint64_t BitSize;
+ GlobalVariable *ByteArray;
+ GlobalVariable *MaskGlobal;
+};
+
+/// A POD-like structure that we use to store a global reference together with
+/// its metadata types. In this pass we frequently need to query the set of
+/// metadata types referenced by a global, which at the IR level is an expensive
+/// operation involving a map lookup; this data structure helps to reduce the
+/// number of times we need to do this lookup.
+class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+ GlobalObject *GO;
+ size_t NTypes;
+ // For functions: true if this is a definition (either in the merged module or
+ // in one of the thinlto modules).
+ bool IsDefinition;
+ // For functions: true if this function is either defined or used in a thinlto
+ // module and its jumptable entry needs to be exported to thinlto backends.
+ bool IsExported;
+
+ friend TrailingObjects;
+ size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
+
+public:
+ static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ bool IsDefinition, bool IsExported,
+ ArrayRef<MDNode *> Types) {
+ auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
+ totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
+ GTM->GO = GO;
+ GTM->NTypes = Types.size();
+ GTM->IsDefinition = IsDefinition;
+ GTM->IsExported = IsExported;
+ std::uninitialized_copy(Types.begin(), Types.end(),
+ GTM->getTrailingObjects<MDNode *>());
+ return GTM;
+ }
+ GlobalObject *getGlobal() const {
+ return GO;
+ }
+ bool isDefinition() const {
+ return IsDefinition;
+ }
+ bool isExported() const {
+ return IsExported;
+ }
+ ArrayRef<MDNode *> types() const {
+ return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
+ }
+};
+
+class LowerTypeTestsModule {
+ Module &M;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ Triple::ArchType Arch;
+ Triple::OSType OS;
+ Triple::ObjectFormatType ObjectFormat;
+
+ IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
+ IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
+ PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
+ IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
+ IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
+
+ // Indirect function call index assignment counter for WebAssembly
+ uint64_t IndirectIndex = 1;
+
+ // Mapping from type identifiers to the call sites that test them, as well as
+ // whether the type identifier needs to be exported to ThinLTO backends as
+ // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
+ struct TypeIdUserInfo {
+ std::vector<CallInst *> CallSites;
+ bool IsExported = false;
+ };
+ DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
+
+ /// This structure describes how to lower type tests for a particular type
+ /// identifier. It is either built directly from the global analysis (during
+ /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
+ /// identifier summaries and external symbol references (in ThinLTO backends).
+ struct TypeIdLowering {
+ TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
+
+ /// All except Unsat: the start address within the combined global.
+ Constant *OffsetedGlobal;
+
+ /// ByteArray, Inline, AllOnes: log2 of the required global alignment
+ /// relative to the start address.
+ Constant *AlignLog2;
+
+ /// ByteArray, Inline, AllOnes: one less than the size of the memory region
+ /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1;
+
+ /// ByteArray: the byte array to test the address against.
+ Constant *TheByteArray;
+
+ /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
+ Constant *BitMask;
+
+ /// Inline: the bit mask to test the address against.
+ Constant *InlineBits;
+ };
+
+ std::vector<ByteArrayInfo> ByteArrayInfos;
+
+ Function *WeakInitializerFn = nullptr;
+
+ void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+ TypeIdLowering importTypeId(StringRef TypeId);
+ void importTypeTest(CallInst *CI);
+ void importFunction(Function *F, bool isDefinition);
+
+ BitSetInfo
+ buildBitSet(Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+ void allocateByteArrays();
+ Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
+ Value *BitOffset);
+ void lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL);
+ void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals);
+ unsigned getJumpTableEntrySize();
+ Type *getJumpTableEntryType();
+ void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
+ void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
+ void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals);
+
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT);
+ void moveInitializerToModuleConstructor(GlobalVariable *GV);
+ void findGlobalVariableUsersOf(Constant *C,
+ SmallSetVector<GlobalVariable *, 8> &Out);
+
+ void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+
+public:
+ LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary);
+ bool lower();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
+};
+
+struct LowerTypeTests : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
+ }
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
+ false)
+char LowerTypeTests::ID = 0;
+
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new LowerTypeTests(ExportSummary, ImportSummary);
+}
+
+/// Build a bit set for TypeId using the object layouts in
+/// GlobalLayout.
+BitSetInfo LowerTypeTestsModule::buildBitSet(
+ Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ BitSetBuilder BSB;
+
+ // Compute the byte offset of each address associated with this type
+ // identifier.
+ for (auto &GlobalAndOffset : GlobalLayout) {
+ for (MDNode *Type : GlobalAndOffset.first->types()) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ BSB.addOffset(GlobalAndOffset.second + Offset);
+ }
+ }
+
+ return BSB.build();
+}
+
+/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
+/// Bits. This pattern matches to the bt instruction on x86.
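+/// For example, assuming Bits is an i64 constant and BitOffset is 70, the
+/// emitted sequence reduces to testing (Bits & (1 << (70 & 63))) != 0, i.e.
+/// bit 6 of Bits.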
+static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
+ Value *BitOffset) {
+ auto BitsType = cast<IntegerType>(Bits->getType());
+ unsigned BitWidth = BitsType->getBitWidth();
+
+ BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
+ Value *BitIndex =
+ B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
+ Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
+ Value *MaskedBits = B.CreateAnd(Bits, BitMask);
+ return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
+}
+
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
+ // Create globals to stand in for byte arrays and masks. These never actually
+  // get initialized; we RAUW and erase them later in allocateByteArrays() once
+ // we know the offset and mask to use.
+ auto ByteArrayGlobal = new GlobalVariable(
+ M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, nullptr);
+
+ ByteArrayInfos.emplace_back();
+ ByteArrayInfo *BAI = &ByteArrayInfos.back();
+
+ BAI->Bits = BSI.Bits;
+ BAI->BitSize = BSI.BitSize;
+ BAI->ByteArray = ByteArrayGlobal;
+ BAI->MaskGlobal = MaskGlobal;
+ return BAI;
+}
+
+void LowerTypeTestsModule::allocateByteArrays() {
+ std::stable_sort(ByteArrayInfos.begin(), ByteArrayInfos.end(),
+ [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
+ return BAI1.BitSize > BAI2.BitSize;
+ });
+
+ std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
+
+ ByteArrayBuilder BAB;
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ uint8_t Mask;
+ BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
+
+ BAI->MaskGlobal->replaceAllUsesWith(
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
+ BAI->MaskGlobal->eraseFromParent();
+ }
+
+ Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
+ auto ByteArray =
+ new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, ByteArrayConst);
+
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
+ Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(
+ ByteArrayConst->getType(), ByteArray, Idxs);
+
+ // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
+ // that the pc-relative displacement is folded into the lea instead of the
+ // test instruction getting another displacement.
+ GlobalAlias *Alias = GlobalAlias::create(
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
+ BAI->ByteArray->replaceAllUsesWith(Alias);
+ BAI->ByteArray->eraseFromParent();
+ }
+
+ ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
+ BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
+ BAB.BitAllocs[6] + BAB.BitAllocs[7];
+ ByteArraySizeBytes = BAB.Bytes.size();
+}
+
+/// Build a test that bit BitOffset is set in the type identifier that was
+/// lowered to TIL, which must be either an Inline or a ByteArray.
+Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
+ const TypeIdLowering &TIL,
+ Value *BitOffset) {
+ if (TIL.TheKind == TypeTestResolution::Inline) {
+ // If the bit set is sufficiently small, we can avoid a load by bit testing
+ // a constant.
+ return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
+ } else {
+ Constant *ByteArray = TIL.TheByteArray;
+ if (AvoidReuse && !ImportSummary) {
+ // Each use of the byte array uses a different alias. This makes the
+ // backend less likely to reuse previously computed byte array addresses,
+ // improving the security of the CFI mechanism based on this pass.
+ // This won't work when importing because TheByteArray is external.
+ ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
+ "bits_use", ByteArray, &M);
+ }
+
+ Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
+ Value *Byte = B.CreateLoad(ByteAddr);
+
+ Value *ByteAndMask =
+ B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
+ return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
+ }
+}
+
+static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
+ Value *V, uint64_t COffset) {
+ if (auto GV = dyn_cast<GlobalObject>(V)) {
+ SmallVector<MDNode *, 2> Types;
+ GV->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ if (COffset == Offset)
+ return true;
+ }
+ return false;
+ }
+
+ if (auto GEP = dyn_cast<GEPOperator>(V)) {
+ APInt APOffset(DL.getPointerSizeInBits(0), 0);
+ bool Result = GEP->accumulateConstantOffset(DL, APOffset);
+ if (!Result)
+ return false;
+ COffset += APOffset.getZExtValue();
+ return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
+ }
+
+ if (auto Op = dyn_cast<Operator>(V)) {
+ if (Op->getOpcode() == Instruction::BitCast)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
+
+ if (Op->getOpcode() == Instruction::Select)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
+ isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
+ }
+
+ return false;
+}
+
+/// Lower a llvm.type.test call to its implementation. Returns the value to
+/// replace the call with.
+Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL) {
+ if (TIL.TheKind == TypeTestResolution::Unsat)
+ return ConstantInt::getFalse(M.getContext());
+
+ Value *Ptr = CI->getArgOperand(0);
+ const DataLayout &DL = M.getDataLayout();
+ if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
+ return ConstantInt::getTrue(M.getContext());
+
+ BasicBlock *InitialBB = CI->getParent();
+
+ IRBuilder<> B(CI);
+
+ Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
+
+ Constant *OffsetedGlobalAsInt =
+ ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
+ if (TIL.TheKind == TypeTestResolution::Single)
+ return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
+
+ Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
+
+ // We need to check that the offset both falls within our range and is
+ // suitably aligned. We can check both properties at the same time by
+ // performing a right rotate by log2(alignment) followed by an integer
+ // comparison against the bitset size. The rotate will move the lower
+ // order bits that need to be zero into the higher order bits of the
+ // result, causing the comparison to fail if they are nonzero. The rotate
+ // also conveniently gives us a bit offset to use during the load from
+ // the bitset.
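+  // As a rough illustration, assume a 64-bit pointer size, AlignLog2 == 3 and
+  // SizeM1 == 7 (an 8-entry, 8-byte-aligned region). For PtrOffset == 0x28 the
+  // rotate yields 5, which passes the ule check against SizeM1; for the
+  // misaligned offset 0x29 the set low bit rotates into bit 61, producing a
+  // huge value that fails the check.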
+ Value *OffsetSHR =
+ B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
+ Value *OffsetSHL = B.CreateShl(
+ PtrOffset, ConstantExpr::getZExt(
+ ConstantExpr::getSub(
+ ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
+ TIL.AlignLog2),
+ IntPtrTy));
+ Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
+
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
+
+ // If the bit set is all ones, testing against it is unnecessary.
+ if (TIL.TheKind == TypeTestResolution::AllOnes)
+ return OffsetInRange;
+
+ // See if the intrinsic is used in the following common pattern:
+ // br(llvm.type.test(...), thenbb, elsebb)
+ // where nothing happens between the type test and the br.
+ // If so, create slightly simpler IR.
+ if (CI->hasOneUse())
+ if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
+ if (CI->getNextNode() == Br) {
+ BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
+ BasicBlock *Else = Br->getSuccessor(1);
+ BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ Br->getMetadata(LLVMContext::MD_prof));
+ ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
+
+ IRBuilder<> ThenB(CI);
+ return createBitSetTest(ThenB, TIL, BitOffset);
+ }
+
+ IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
+
+ // Now that we know that the offset is in range and aligned, load the
+ // appropriate bit from the bitset.
+ Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
+
+ // The value we want is 0 if we came directly from the initial block
+ // (having failed the range or alignment checks), or the loaded bit if
+ // we came from the block in which we loaded it.
+ B.SetInsertPoint(CI);
+ PHINode *P = B.CreatePHI(Int1Ty, 2);
+ P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
+ P->addIncoming(Bit, ThenB.GetInsertBlock());
+ return P;
+}
+
+/// Given a disjoint set of type identifiers and globals, lay out the globals,
+/// build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ // Build a new global with the combined contents of the referenced globals.
+ // This global is a struct whose even-indexed elements contain the original
+ // contents of the referenced globals and whose odd-indexed elements contain
+ // any padding required to align the next element to the next power of 2.
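+  // For example, a 10-byte global is followed by 6 bytes of padding
+  // (NextPowerOf2(9) == 16); if the power-of-2 padding would exceed 128
+  // bytes, the global is instead padded to the next multiple of 128 (see the
+  // cap below).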
+ std::vector<Constant *> GlobalInits;
+ const DataLayout &DL = M.getDataLayout();
+ for (GlobalTypeMember *G : Globals) {
+ GlobalVariable *GV = cast<GlobalVariable>(G->getGlobal());
+ GlobalInits.push_back(GV->getInitializer());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+
+ // Compute the amount of padding required.
+ uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
+
+    // A cap of 128 was found experimentally to give a good data/instruction
+ // overhead tradeoff.
+ if (Padding > 128)
+ Padding = alignTo(InitSize, 128) - InitSize;
+
+ GlobalInits.push_back(
+ ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
+ }
+ if (!GlobalInits.empty())
+ GlobalInits.pop_back();
+ Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
+ auto *CombinedGlobal =
+ new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, NewInit);
+
+ StructType *NewTy = cast<StructType>(NewInit->getType());
+ const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy);
+
+ // Compute the offsets of the original globals within the new global.
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ for (unsigned I = 0; I != Globals.size(); ++I)
+ // Multiply by 2 to account for padding elements.
+ GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
+
+ lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
+
+ // Build aliases pointing to offsets into the combined global for each
+ // global from which we built the combined global, and replace references
+ // to the original globals with references to the aliases.
+ for (unsigned I = 0; I != Globals.size(); ++I) {
+ GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
+
+ // Multiply by 2 to account for padding elements.
+ Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, I * 2)};
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
+ NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
+ assert(GV->getType()->getAddressSpace() == 0);
+ GlobalAlias *GAlias =
+ GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
+ "", CombinedGlobalElemPtr, &M);
+ GAlias->setVisibility(GV->getVisibility());
+ GAlias->takeName(GV);
+ GV->replaceAllUsesWith(GAlias);
+ GV->eraseFromParent();
+ }
+}
+
+/// Export the given type identifier so that ThinLTO backends may import it.
+/// Type identifiers are exported by adding coarse-grained information about how
+/// to test the type identifier to the summary, and creating symbols in the
+/// object file (aliases and absolute symbols) containing fine-grained
+/// information about the type identifier.
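+/// As a sketch of the naming scheme used below, a hypothetical type identifier
+/// "_ZTS1A" with a ByteArray resolution would be described by symbols such as
+/// __typeid__ZTS1A_global_addr, __typeid__ZTS1A_align, __typeid__ZTS1A_size_m1,
+/// __typeid__ZTS1A_byte_array and __typeid__ZTS1A_bit_mask.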
+void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+ const TypeIdLowering &TIL) {
+ TypeTestResolution &TTRes =
+ ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
+ TTRes.TheKind = TIL.TheKind;
+
+ auto ExportGlobal = [&](StringRef Name, Constant *C) {
+ GlobalAlias *GA =
+ GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ "__typeid_" + TypeId + "_" + Name, C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ ExportGlobal("global_addr", TIL.OffsetedGlobal);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy));
+ ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy));
+
+ uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
+ else
+ TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ ExportGlobal("byte_array", TIL.TheByteArray);
+ ExportGlobal("bit_mask", TIL.BitMask);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ ExportGlobal("inline_bits",
+ ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy));
+}
+
+LowerTypeTestsModule::TypeIdLowering
+LowerTypeTestsModule::importTypeId(StringRef TypeId) {
+ const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+ if (!TidSummary)
+ return {}; // Unsat: no globals match this type id.
+ const TypeTestResolution &TTRes = TidSummary->TTRes;
+
+ TypeIdLowering TIL;
+ TIL.TheKind = TTRes.TheKind;
+
+ auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) {
+ Constant *C =
+ M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ TIL.OffsetedGlobal = ImportGlobal("global_addr", 0);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty);
+ TIL.SizeM1 = ConstantExpr::getPtrToInt(
+ ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ TIL.TheByteArray = ImportGlobal("byte_array", 0);
+ TIL.BitMask = ImportGlobal("bit_mask", 8);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TIL.InlineBits = ConstantExpr::getPtrToInt(
+ ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth),
+ TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
+
+ return TIL;
+}
+
+void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+
+ auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ if (!TypeIdStr)
+ report_fatal_error(
+ "Second argument of llvm.type.test must be a metadata string");
+
+ TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
+ Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+}
+
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isDefinition describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ // Declaration of a local function - nothing to do.
+ if (F->isDeclarationForLinker() && isDefinition)
+ return;
+
+ GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+ std::string Name = F->getName();
+ Function *FDecl;
+
+ if (F->isDeclarationForLinker() && !isDefinition) {
+ // Declaration of an external function.
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name + ".cfi_jt", &M);
+ FDecl->setVisibility(GlobalValue::HiddenVisibility);
+ } else if (isDefinition) {
+ F->setName(Name + ".cfi");
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ F->setVisibility(GlobalValue::HiddenVisibility);
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name, &M);
+ FDecl->setVisibility(Visibility);
+ } else {
+ // Function definition without type metadata, where some other translation
+ // unit contained a declaration with type metadata. This normally happens
+ // during mixed CFI + non-CFI compilation. We do nothing with the function
+ // so that it is treated the same way as a function defined outside of the
+ // LTO unit.
+ return;
+ }
+
+ if (F->isWeakForLinker())
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+ else
+ F->replaceAllUsesWith(FDecl);
+}
+
+void LowerTypeTestsModule::lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
+
+ // For each type identifier in this disjoint set...
+ for (Metadata *TypeId : TypeIds) {
+ // Build the bitset.
+ BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
+ DEBUG({
+ if (auto MDS = dyn_cast<MDString>(TypeId))
+ dbgs() << MDS->getString() << ": ";
+ else
+ dbgs() << "<unnamed>: ";
+ BSI.print(dbgs());
+ });
+
+ TypeIdLowering TIL;
+ TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
+ Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
+ TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
+ if (BSI.isAllOnes()) {
+ TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
+ : TypeTestResolution::AllOnes;
+ } else if (BSI.BitSize <= 64) {
+ TIL.TheKind = TypeTestResolution::Inline;
+ uint64_t InlineBits = 0;
+ for (auto Bit : BSI.Bits)
+ InlineBits |= uint64_t(1) << Bit;
+ if (InlineBits == 0)
+ TIL.TheKind = TypeTestResolution::Unsat;
+ else
+ TIL.InlineBits = ConstantInt::get(
+ (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
+ } else {
+ TIL.TheKind = TypeTestResolution::ByteArray;
+ ++NumByteArraysCreated;
+ ByteArrayInfo *BAI = createByteArray(BSI);
+ TIL.TheByteArray = BAI->ByteArray;
+ TIL.BitMask = BAI->MaskGlobal;
+ }
+
+ TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
+
+ if (TIUI.IsExported)
+ exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+
+ // Lower each call to llvm.type.test for this type identifier.
+ for (CallInst *CI : TIUI.CallSites) {
+ ++NumTypeTestCallsLowered;
+ Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
+ }
+}
+
+void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
+ if (Type->getNumOperands() != 2)
+ report_fatal_error("All operands of type metadata must have 2 elements");
+
+ if (GO->isThreadLocal())
+ report_fatal_error("Bit set element may not be thread-local");
+ if (isa<GlobalVariable>(GO) && GO->hasSection())
+ report_fatal_error(
+ "A member of a type identifier may not have an explicit section");
+
+ // FIXME: We previously checked that global var member of a type identifier
+ // must be a definition, but the IR linker may leave type metadata on
+ // declarations. We should restore this check after fixing PR31759.
+
+ auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
+ if (!OffsetConstMD)
+ report_fatal_error("Type offset must be a constant");
+ auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
+ if (!OffsetInt)
+ report_fatal_error("Type offset must be an integer constant");
+}
+
+static const unsigned kX86JumpTableEntrySize = 8;
+static const unsigned kARMJumpTableEntrySize = 4;
+
+unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
+ switch (Arch) {
+ case Triple::x86:
+ case Triple::x86_64:
+ return kX86JumpTableEntrySize;
+ case Triple::arm:
+ case Triple::thumb:
+ case Triple::aarch64:
+ return kARMJumpTableEntrySize;
+ default:
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+}
+
+// Create a jump table entry for the target. This consists of an instruction
+// sequence containing a relative branch to Dest. Appends inline asm text,
+// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
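+// For example, on x86-64 the first entry appends "jmp ${0:c}@plt" plus three
+// int3 padding bytes (an 8-byte entry) to AsmOS, an "s" constraint to
+// ConstraintOS, and Dest to AsmArgs.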
+void LowerTypeTestsModule::createJumpTableEntry(
+ raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest) {
+ unsigned ArgIndex = AsmArgs.size();
+
+ if (Arch == Triple::x86 || Arch == Triple::x86_64) {
+ AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
+ AsmOS << "int3\nint3\nint3\n";
+ } else if (Arch == Triple::arm || Arch == Triple::aarch64) {
+ AsmOS << "b $" << ArgIndex << "\n";
+ } else if (Arch == Triple::thumb) {
+ AsmOS << "b.w $" << ArgIndex << "\n";
+ } else {
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+
+ ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
+ AsmArgs.push_back(Dest);
+}
+
+Type *LowerTypeTestsModule::getJumpTableEntryType() {
+ return ArrayType::get(Int8Ty, getJumpTableEntrySize());
+}
+
+/// Given a disjoint set of type identifiers and functions, build the bit sets
+/// and lower the llvm.type.test calls in an architecture-dependent way.
+void LowerTypeTestsModule::buildBitSetsFromFunctions(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
+ Arch == Triple::thumb || Arch == Triple::aarch64)
+ buildBitSetsFromFunctionsNative(TypeIds, Functions);
+ else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
+ buildBitSetsFromFunctionsWASM(TypeIds, Functions);
+ else
+ report_fatal_error("Unsupported architecture for jump tables");
+}
+
+void LowerTypeTestsModule::moveInitializerToModuleConstructor(
+ GlobalVariable *GV) {
+ if (WeakInitializerFn == nullptr) {
+ WeakInitializerFn = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::InternalLinkage, "__cfi_global_var_init", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
+ ReturnInst::Create(M.getContext(), BB);
+ WeakInitializerFn->setSection(
+ ObjectFormat == Triple::MachO
+ ? "__TEXT,__StaticInit,regular,pure_instructions"
+ : ".text.startup");
+ // This code is equivalent to relocation application, and should run at the
+ // earliest possible time (i.e. with the highest priority).
+ appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
+ }
+
+ IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
+ GV->setConstant(false);
+ IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlignment());
+ GV->setInitializer(Constant::getNullValue(GV->getValueType()));
+}
+
+void LowerTypeTestsModule::findGlobalVariableUsersOf(
+ Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
+  for (auto *U : C->users()) {
+ if (auto *GV = dyn_cast<GlobalVariable>(U))
+ Out.insert(GV);
+ else if (auto *C2 = dyn_cast<Constant>(U))
+ findGlobalVariableUsersOf(C2, Out);
+ }
+}
+
+// Replace all uses of F with (F ? JT : 0).
+void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
+ Function *F, Constant *JT) {
+  // The target expression cannot appear in a constant initializer on most
+ // (all?) targets. Switch to a runtime initializer.
+ SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
+ findGlobalVariableUsersOf(F, GlobalVarUsers);
+ for (auto GV : GlobalVarUsers)
+ moveInitializerToModuleConstructor(GV);
+
+  // Cannot RAUW F with an expression that uses F. Replace it with a temporary
+ // placeholder first.
+ Function *PlaceholderFn =
+ Function::Create(cast<FunctionType>(F->getValueType()),
+ GlobalValue::ExternalWeakLinkage, "", &M);
+ F->replaceAllUsesWith(PlaceholderFn);
+
+ Constant *Target = ConstantExpr::getSelect(
+ ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
+ Constant::getNullValue(F->getType())),
+ JT, Constant::getNullValue(F->getType()));
+ PlaceholderFn->replaceAllUsesWith(Target);
+ PlaceholderFn->eraseFromParent();
+}
+
+void LowerTypeTestsModule::createJumpTable(
+ Function *F, ArrayRef<GlobalTypeMember *> Functions) {
+ std::string AsmStr, ConstraintStr;
+ raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
+ SmallVector<Value *, 16> AsmArgs;
+ AsmArgs.reserve(Functions.size() * 2);
+
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ createJumpTableEntry(AsmOS, ConstraintOS, AsmArgs,
+ cast<Function>(Functions[I]->getGlobal()));
+
+ // Try to emit the jump table at the end of the text segment.
+  // The jump table must come after __cfi_check in cross-DSO mode.
+ // FIXME: this magic section name seems to do the trick.
+ F->setSection(ObjectFormat == Triple::MachO
+ ? "__TEXT,__text,regular,pure_instructions"
+ : ".text.cfi");
+ // Align the whole table by entry size.
+ F->setAlignment(getJumpTableEntrySize());
+ // Skip prologue.
+ // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
+ // Luckily, this function does not get any prologue even without the
+ // attribute.
+ if (OS != Triple::Win32)
+ F->addFnAttr(llvm::Attribute::Naked);
+ // Thumb jump table assembly needs Thumb2. The following attribute is added by
+ // Clang for -march=armv7.
+ if (Arch == Triple::thumb)
+ F->addFnAttr("target-cpu", "cortex-a8");
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+ IRBuilder<> IRB(BB);
+
+ SmallVector<Type *, 16> ArgTypes;
+ ArgTypes.reserve(AsmArgs.size());
+ for (const auto &Arg : AsmArgs)
+ ArgTypes.push_back(Arg->getType());
+ InlineAsm *JumpTableAsm =
+ InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
+ AsmOS.str(), ConstraintOS.str(),
+ /*hasSideEffects=*/true);
+
+ IRB.CreateCall(JumpTableAsm, AsmArgs);
+ IRB.CreateUnreachable();
+}
+
+/// Given a disjoint set of type identifiers and functions, build a jump table
+/// for the functions, build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ // Unlike the global bitset builder, the function bitset builder cannot
+ // re-arrange functions in a particular order and base its calculations on the
+ // layout of the functions' entry points, as we have no idea how large a
+ // particular function will end up being (the size could even depend on what
+ // this pass does!) Instead, we build a jump table, which is a block of code
+ // consisting of one branch instruction for each of the functions in the bit
+ // set that branches to the target function, and redirect any taken function
+ // addresses to the corresponding jump table entry. In the object file's
+ // symbol table, the symbols for the target functions also refer to the jump
+ // table entries, so that addresses taken outside the module will pass any
+ // verification done inside the module.
+ //
+ // In more concrete terms, suppose we have three functions f, g, h which are
+ // of the same type, and a function foo that returns their addresses:
+ //
+ // f:
+ // mov 0, %eax
+ // ret
+ //
+ // g:
+ // mov 1, %eax
+ // ret
+ //
+ // h:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+ // We output the jump table as module-level inline asm string. The end result
+ // will (conceptually) look like this:
+ //
+ // f = .cfi.jumptable
+ // g = .cfi.jumptable + 4
+ // h = .cfi.jumptable + 8
+ // .cfi.jumptable:
+ // jmp f.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp g.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp h.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ //
+ // f.cfi:
+ // mov 0, %eax
+ // ret
+ //
+ // g.cfi:
+ // mov 1, %eax
+ // ret
+ //
+ // h.cfi:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+ // Because the addresses of f, g, h are evenly spaced at a power of 2, in the
+ // normal case the check can be carried out using the same kind of simple
+ // arithmetic that we normally use for globals.
+
+ // FIXME: find a better way to represent the jumptable in the IR.
+ assert(!Functions.empty());
+
+ // Build a simple layout based on the regular layout of jump tables.
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ unsigned EntrySize = getJumpTableEntrySize();
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ GlobalLayout[Functions[I]] = I * EntrySize;
+
+ Function *JumpTableFn =
+ Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::PrivateLinkage, ".cfi.jumptable", &M);
+ ArrayType *JumpTableType =
+ ArrayType::get(getJumpTableEntryType(), Functions.size());
+ auto JumpTable =
+ ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
+
+ lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
+
+ // Build aliases pointing to offsets into the jump table, and replace
+ // references to the original functions with references to the aliases.
+ for (unsigned I = 0; I != Functions.size(); ++I) {
+ Function *F = cast<Function>(Functions[I]->getGlobal());
+ bool IsDefinition = Functions[I]->isDefinition();
+
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
+ ConstantExpr::getInBoundsGetElementPtr(
+ JumpTableType, JumpTable,
+ ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, I)}),
+ F->getType());
+ if (Functions[I]->isExported()) {
+ if (IsDefinition) {
+ ExportSummary->cfiFunctionDefs().insert(F->getName());
+ } else {
+ GlobalAlias *JtAlias = GlobalAlias::create(
+ F->getValueType(), 0, GlobalValue::ExternalLinkage,
+ F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+ JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ ExportSummary->cfiFunctionDecls().insert(F->getName());
+ }
+ }
+ if (!IsDefinition) {
+ if (F->isWeakForLinker())
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
+ else
+ F->replaceAllUsesWith(CombinedGlobalElemPtr);
+ } else {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalAlias *FAlias = GlobalAlias::create(
+ F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
+ FAlias->setVisibility(F->getVisibility());
+ FAlias->takeName(F);
+ if (FAlias->hasName())
+ F->setName(FAlias->getName() + ".cfi");
+ F->replaceAllUsesWith(FAlias);
+ }
+ if (!F->isDeclarationForLinker())
+ F->setLinkage(GlobalValue::InternalLinkage);
+ }
+
+ createJumpTable(JumpTableFn, Functions);
+}
+
+/// Assign a dummy layout using an incrementing counter, tag each function
+/// with its index represented as metadata, and lower each type test to an
+/// integer range comparison. During generation of the indirect function call
+/// table in the backend, it will assign the given indexes.
+/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
+/// been finalized.
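+/// For illustration, the first address-taken function processed here is tagged
+/// with "wasm.index" metadata holding the i64 value 1 (the counter starts at
+/// 1 and keeps incrementing across sets), and its type tests are lowered to
+/// integer range comparisons against a null "jump table" base pointer.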
+void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ assert(!Functions.empty());
+
+ // Build consecutive monotonic integer ranges for each call target set
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+
+ for (GlobalTypeMember *GTM : Functions) {
+ Function *F = cast<Function>(GTM->getGlobal());
+
+ // Skip functions that are not address taken, to avoid bloating the table
+ if (!F->hasAddressTaken())
+ continue;
+
+ // Store metadata with the index for each function
+ MDNode *MD = MDNode::get(F->getContext(),
+ ArrayRef<Metadata *>(ConstantAsMetadata::get(
+ ConstantInt::get(Int64Ty, IndirectIndex))));
+ F->setMetadata("wasm.index", MD);
+
+ // Assign the counter value
+ GlobalLayout[GTM] = IndirectIndex++;
+ }
+
+ // The indirect function table index space starts at zero, so pass a NULL
+ // pointer as the subtracted "jump table" offset.
+ lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
+ GlobalLayout);
+}
+
+void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices;
+ for (unsigned I = 0; I != TypeIds.size(); ++I)
+ TypeIdIndices[TypeIds[I]] = I;
+
+ // For each type identifier, build a set of indices that refer to members of
+ // the type identifier.
+ std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
+ unsigned GlobalIndex = 0;
+ for (GlobalTypeMember *GTM : Globals) {
+ for (MDNode *Type : GTM->types()) {
+ // Type = { offset, type identifier }
+ unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
+ TypeMembers[TypeIdIndex].insert(GlobalIndex);
+ }
+ GlobalIndex++;
+ }
+
+ // Order the sets of indices by size. The GlobalLayoutBuilder works best
+ // when given small index sets first.
+ std::stable_sort(
+ TypeMembers.begin(), TypeMembers.end(),
+ [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) {
+ return O1.size() < O2.size();
+ });
+
+ // Create a GlobalLayoutBuilder and provide it with index sets as layout
+ // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
+ // close together as possible.
+ GlobalLayoutBuilder GLB(Globals.size());
+ for (auto &&MemSet : TypeMembers)
+ GLB.addFragment(MemSet);
+
+ // Build the bitsets from this disjoint set.
+ if (Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal())) {
+ // Build a vector of global variables with the computed layout.
+ std::vector<GlobalTypeMember *> OrderedGVs(Globals.size());
+ auto OGI = OrderedGVs.begin();
+ for (auto &&F : GLB.Fragments) {
+ for (auto &&Offset : F) {
+ auto GV = dyn_cast<GlobalVariable>(Globals[Offset]->getGlobal());
+ if (!GV)
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
+ *OGI++ = Globals[Offset];
+ }
+ }
+
+ buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs);
+ } else {
+ // Build a vector of functions with the computed layout.
+ std::vector<GlobalTypeMember *> OrderedFns(Globals.size());
+ auto OFI = OrderedFns.begin();
+ for (auto &&F : GLB.Fragments) {
+ for (auto &&Offset : F) {
+ auto Fn = dyn_cast<Function>(Globals[Offset]->getGlobal());
+ if (!Fn)
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
+ *OFI++ = Globals[Offset];
+ }
+ }
+
+ buildBitSetsFromFunctions(TypeIds, OrderedFns);
+ }
+}
+
+/// Lower all type tests in this module.
+LowerTypeTestsModule::LowerTypeTestsModule(
+ Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+ assert(!(ExportSummary && ImportSummary));
+ Triple TargetTriple(M.getTargetTriple());
+ Arch = TargetTriple.getArch();
+ OS = TargetTriple.getOS();
+ ObjectFormat = TargetTriple.getObjectFormat();
+}
+
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary;
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ LowerTypeTestsModule(
+ M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
+bool LowerTypeTestsModule::lower() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
+ !ImportSummary)
+ return false;
+
+ if (ImportSummary) {
+ if (TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ }
+
+ SmallVector<Function *, 8> Defs;
+ SmallVector<Function *, 8> Decls;
+ for (auto &F : M) {
+ // CFI functions are either external, or promoted. A local function may
+ // have the same name, but it's not the one we are looking for.
+ if (F.hasLocalLinkage())
+ continue;
+ if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+ Defs.push_back(&F);
+ else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+ Decls.push_back(&F);
+ }
+
+ for (auto F : Defs)
+ importFunction(F, /*isDefinition*/ true);
+ for (auto F : Decls)
+ importFunction(F, /*isDefinition*/ false);
+
+ return true;
+ }
+
+ // Equivalence class set containing type identifiers and the globals that
+ // reference them. This is used to partition the set of type identifiers in
+ // the module into disjoint sets.
+ typedef EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>
+ GlobalClassesTy;
+ GlobalClassesTy GlobalClasses;
+
+ // Verify the type metadata and build a few data structures to let us
+ // efficiently enumerate the type identifiers associated with a global:
+ // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
+ // of associated type metadata) and a mapping from type identifiers to their
+ // list of GlobalTypeMembers and last observed index in the list of globals.
+ // The indices will be used later to deterministically order the list of type
+ // identifiers.
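+  // For example, a global annotated with type metadata !{i64 0, !"_ZTS1A"}
+  // (a hypothetical identifier) contributes one GlobalTypeMember, recorded
+  // under the "_ZTS1A" entry of TypeIdInfo below.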
+ BumpPtrAllocator Alloc;
+ struct TIInfo {
+ unsigned Index;
+ std::vector<GlobalTypeMember *> RefGlobals;
+ };
+ llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
+ unsigned I = 0;
+ SmallVector<MDNode *, 2> Types;
+
+ struct ExportedFunctionInfo {
+ CfiFunctionLinkage Linkage;
+ MDNode *FuncMD; // {name, linkage, type[, type...]}
+ };
+ DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
+ if (ExportSummary) {
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto FuncMD : CfiFunctionsMD->operands()) {
+ assert(FuncMD->getNumOperands() >= 2);
+ StringRef FunctionName =
+ cast<MDString>(FuncMD->getOperand(0))->getString();
+ if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName))))
+ continue;
+ CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+ cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+ if (!P.second && P.first->second.Linkage != CFL_Definition)
+ P.first->second = {Linkage, FuncMD};
+ }
+
+ for (const auto &P : ExportedFunctions) {
+ StringRef FunctionName = P.first;
+ CfiFunctionLinkage Linkage = P.second.Linkage;
+ MDNode *FuncMD = P.second.FuncMD;
+ Function *F = M.getFunction(FunctionName);
+ if (!F)
+ F = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalVariable::ExternalLinkage, FunctionName, &M);
+
+ if (Linkage == CFL_Definition)
+ F->eraseMetadata(LLVMContext::MD_type);
+
+ if (F->isDeclaration()) {
+ if (Linkage == CFL_WeakDeclaration)
+ F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+ SmallVector<MDNode *, 2> Types;
+ for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+ F->addMetadata(LLVMContext::MD_type,
+ *cast<MDNode>(FuncMD->getOperand(I).get()));
+ }
+ }
+ }
+ }
+
+ for (GlobalObject &GO : M.global_objects()) {
+ if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
+ continue;
+
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ if (Types.empty())
+ continue;
+
+ bool IsDefinition = !GO.isDeclarationForLinker();
+ bool IsExported = false;
+ if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
+ IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ }
+
+ auto *GTM =
+ GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
+ for (MDNode *Type : Types) {
+ verifyTypeMDNode(&GO, Type);
+ auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
+ Info.Index = ++I;
+ Info.RefGlobals.push_back(GTM);
+ }
+ }
+
+ auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeIdUsers to keep track of whether we have seen this type
+ // identifier before. If we have, we don't need to re-add the referenced
+ // globals to the equivalence class.
+ auto Ins = TypeIdUsers.insert({TypeId, {}});
+ if (Ins.second) {
+ // Add the type identifier to the equivalence class.
+ GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
+ GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ }
+
+ return Ins.first->second;
+ };
+
+ if (TypeTestFunc) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+ auto TypeId = TypeIdMDVal->getMetadata();
+ AddTypeIdUse(TypeId).CallSites.push_back(CI);
+ }
+ }
+
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdInfo) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS || !ExportSummary->isGlobalValueLive(FS))
+ continue;
+ for (GlobalValue::GUID G : FS->type_tests())
+ for (Metadata *MD : MetadataByGUID[G])
+ AddTypeIdUse(MD).IsExported = true;
+ }
+ }
+ }
+
+ if (GlobalClasses.empty())
+ return false;
+
+ // Build a list of disjoint sets ordered by their maximum global index for
+ // determinism.
+ std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
+ for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
+ E = GlobalClasses.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+ ++NumTypeIdDisjointSets;
+
+ unsigned MaxIndex = 0;
+ for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if ((*MI).is<Metadata *>())
+ MaxIndex = std::max(MaxIndex, TypeIdInfo[MI->get<Metadata *>()].Index);
+ }
+ Sets.emplace_back(I, MaxIndex);
+ }
+ std::sort(Sets.begin(), Sets.end(),
+ [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
+ const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
+ return S1.second < S2.second;
+ });
+
+ // For each disjoint set we found...
+ for (const auto &S : Sets) {
+ // Build the list of type identifiers in this disjoint set.
+ std::vector<Metadata *> TypeIds;
+ std::vector<GlobalTypeMember *> Globals;
+ for (GlobalClassesTy::member_iterator MI =
+ GlobalClasses.member_begin(S.first);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if ((*MI).is<Metadata *>())
+ TypeIds.push_back(MI->get<Metadata *>());
+ else
+ Globals.push_back(MI->get<GlobalTypeMember *>());
+ }
+
+ // Order type identifiers by global index for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and indices.
+ std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
+ return TypeIdInfo[M1].Index < TypeIdInfo[M2].Index;
+ });
+
+ // Build bitsets for this disjoint set.
+ buildBitSetsFromDisjointSet(TypeIds, Globals);
+ }
+
+ allocateByteArrays();
+
+ return true;
+}
+
+PreservedAnalyses LowerTypeTestsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
+ /*ImportSummary=*/nullptr)
+ .lower();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
new file mode 100644
index 000000000000..0e478ba607be
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -0,0 +1,892 @@
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergeable and folds them.
+//
+// An order relation is defined on the set of functions. It is based on a
+// function comparison procedure that returns
+// 0 when the functions are equal,
+// -1 when the left function is less than the right function, and
+// 1 in the opposite case. We need a total ordering, so we must maintain
+// four properties on the function set:
+// a <= a (reflexivity)
+// if a <= b and b <= a then a = b (antisymmetry)
+// if a <= b and b <= c then a <= c (transitivity)
+// for all a and b: a <= b or b <= a (totality)
+//
+// The comparison iterates through each instruction in each basic block.
+// Functions are kept in a binary tree. For each new function F we perform a
+// lookup in that binary tree.
+// In practice it works the following way:
+// -- We define a Function* container class with a custom "operator<"
+//    (FunctionPtr).
+// -- "FunctionPtr" instances are stored in a std::set, so every
+//    std::set::insert operation gives a result in log(N) time.
+//
+// As an optimization, a hash of the function structure is calculated first, and
+// two functions are only compared if they have the same hash. This hash is
+// cheap to compute, and has the property that if function F == G according to
+// the comparison function, then hash(F) == hash(G). This consistency property
+// is critical to ensuring all possible merging opportunities are exploited.
+// Collisions in the hash affect the speed of the pass but not the correctness
+// or determinism of the resulting transformation.
+//
+// When a match is found the functions are folded. If both functions are
+// overridable, we move the functionality into a new internal function and
+// leave two overridable thunks to it.
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such functions.
+//
+// * be smarter about bitcasts.
+//
+// In order to fold functions, we will sometimes add either bitcast instructions
+// or bitcast constant expressions. Unfortunately, this can confound further
+// analysis since the two functions differ where one has a bitcast and the
+// other doesn't. We should learn to look through bitcasts.
+//
+// * Compare complex types with pointer types inside.
+// * Compare cross-reference cases.
+// * Compare complex expressions.
+//
+// All three issues above can be described as the ability to prove that
+// fA == fB == fC == fE == fF == fG in the example below:
+//
+// void fA() {
+// fB();
+// }
+// void fB() {
+// fA();
+// }
+//
+// void fE() {
+// fF();
+// }
+// void fF() {
+// fG();
+// }
+// void fG() {
+// fE();
+// }
+//
+// The simplest cross-reference case (fA <--> fB) was implemented in previous
+// versions of MergeFunctions, though it appeared in only two function pairs
+// in the test-suite (which contains >50k functions).
+// Detecting more complex cross-referencing (e.g. A->B->C->D->A), however,
+// could cover many more cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mergefunc"
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumThunksWritten, "Number of thunks generated");
+STATISTIC(NumAliasesWritten, "Number of aliases generated");
+STATISTIC(NumDoubleWeak, "Number of new functions created");
+
+static cl::opt<unsigned> NumFunctionsForSanityCheck(
+ "mergefunc-sanity",
+ cl::desc("How many functions in module could be used for "
+ "MergeFunctions pass sanity check. "
+ "'0' disables this check. Works only with '-debug' key."),
+ cl::init(0), cl::Hidden);
+
+// Under option -mergefunc-preserve-debug-info we:
+// - Do not create a new function for a thunk.
+// - Retain the debug info for a thunk's parameters (and associated
+// instructions for the debug info) from the entry block.
+// Note: -debug will display the algorithm at work.
+// - Create debug-info for the call (to the shared implementation) made by
+// a thunk and its return value.
+// - Erase the rest of the function, retaining the (minimally sized) entry
+// block to create a thunk.
+// - Preserve a thunk's call site to point to the thunk even when both occur
+//     within the same translation unit, to aid debuggability. Note that this
+// behaviour differs from the underlying -mergefunc implementation which
+// modifies the thunk's call site to point to the shared implementation
+// when both occur within the same translation unit.
+static cl::opt<bool>
+ MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
+ cl::init(false),
+ cl::desc("Preserve debug info in thunk when mergefunc "
+ "transformations are made."));
+
+namespace {
+
+class FunctionNode {
+ mutable AssertingVH<Function> F;
+ FunctionComparator::FunctionHash Hash;
+public:
+ // Note the hash is recalculated potentially multiple times, but it is cheap.
+ FunctionNode(Function *F)
+ : F(F), Hash(FunctionComparator::functionHash(*F)) {}
+ Function *getFunc() const { return F; }
+ FunctionComparator::FunctionHash getHash() const { return Hash; }
+
+ /// Replace the reference to the function F by the function G, assuming their
+ /// implementations are equal.
+ void replaceBy(Function *G) const {
+ F = G;
+ }
+
+ void release() { F = nullptr; }
+};
+
+/// MergeFunctions finds functions which will generate identical machine code,
+/// by considering all pointer types to be equivalent. Once identified,
+/// MergeFunctions will fold them by replacing a call to one with a call to a
+/// bitcast of the other.
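+/// For instance, given two functions with identical bodies that differ only in
+/// pointer parameter types, one of them is rewritten as an alias or a thunk
+/// that tail-calls a bitcast of the other.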
+///
+class MergeFunctions : public ModulePass {
+public:
+ static char ID;
+ MergeFunctions()
+ : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(),
+ HasGlobalAliases(false) {
+ initializeMergeFunctionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+private:
+ // The function comparison operator is provided here so that FunctionNodes do
+ // not need to become larger with another pointer.
+ class FunctionNodeCmp {
+ GlobalNumberState* GlobalNumbers;
+ public:
+ FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
+ bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
+ // Order first by hashes, then full function comparison.
+ if (LHS.getHash() != RHS.getHash())
+ return LHS.getHash() < RHS.getHash();
+ FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers);
+ return FCmp.compare() == -1;
+ }
+ };
+ typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType;
+
+ GlobalNumberState GlobalNumbers;
+
+ /// A work queue of functions that may have been modified and should be
+ /// analyzed again.
+ std::vector<WeakTrackingVH> Deferred;
+
+ /// Checks the rules of order relation introduced among functions set.
+ /// Returns true, if sanity check has been passed, and false if failed.
+#ifndef NDEBUG
+ bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
+#endif
+
+  /// Insert a Function into the FnTree, or merge it away if it's
+ /// equal to one that's already present.
+ bool insert(Function *NewFunction);
+
+ /// Remove a Function from the FnTree and queue it up for a second sweep of
+ /// analysis.
+ void remove(Function *F);
+
+ /// Find the functions that use this Value and remove them from FnTree and
+ /// queue the functions.
+ void removeUsers(Value *V);
+
+ /// Replace all direct calls of Old with calls of New. Will bitcast New if
+ /// necessary to make types match.
+ void replaceDirectCallers(Function *Old, Function *New);
+
+ /// Merge two equivalent functions. Upon completion, G may be deleted, or may
+ /// be converted into a thunk. In either case, it should never be visited
+ /// again.
+ void mergeTwoFunctions(Function *F, Function *G);
+
+ /// Replace G with a thunk or an alias to F. Deletes G.
+ void writeThunkOrAlias(Function *F, Function *G);
+
+ /// Fill PDIUnrelatedWL with instructions from the entry block that are
+ /// unrelated to parameter related debug info.
+ void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+ std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Erase the rest of the CFG (i.e. barring the entry block).
+ void eraseTail(Function *G);
+
+ /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
+ /// parameter debug info, from the entry block.
+ void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Replace G with a simple tail call to bitcast(F). Also (unless
+ /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
+ /// delete G.
+ void writeThunk(Function *F, Function *G);
+
+ /// Replace G with an alias to F. Deletes G.
+ void writeAlias(Function *F, Function *G);
+
+ /// Replace function F with function G in the function tree.
+ void replaceFunctionInTree(const FunctionNode &FN, Function *G);
+
+ /// The set of all distinct functions. Use the insert() and remove() methods
+ /// to modify it. The map allows efficient lookup and deferring of Functions.
+ FnTreeType FnTree;
+ // Map functions to the iterators of the FunctionNode which contains them
+ // in the FnTree. This must be updated carefully whenever the FnTree is
+ // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
+ // dangling iterators into FnTree. The invariant that preserves this is that
+ // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
+ ValueMap<Function*, FnTreeType::iterator> FNodesInTree;
+
+ /// Whether or not the target supports global aliases.
+ bool HasGlobalAliases;
+};
+
+} // end anonymous namespace
+
+char MergeFunctions::ID = 0;
+INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false)
+
+ModulePass *llvm::createMergeFunctionsPass() {
+ return new MergeFunctions();
+}
+
+#ifndef NDEBUG
+bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
+ if (const unsigned Max = NumFunctionsForSanityCheck) {
+ unsigned TripleNumber = 0;
+ bool Valid = true;
+
+ dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
+
+ unsigned i = 0;
+ for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
+ E = Worklist.end();
+ I != E && i < Max; ++I, ++i) {
+ unsigned j = i;
+ for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
+ ++J, ++j) {
+ Function *F1 = cast<Function>(*I);
+ Function *F2 = cast<Function>(*J);
+ int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
+ int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();
+
+ // If F1 <= F2, then F2 >= F1, otherwise report failure.
+ if (Res1 != -Res2) {
+ dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
+ << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n';
+ Valid = false;
+ }
+
+ if (Res1 == 0)
+ continue;
+
+ unsigned k = j;
+ for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
+ ++k, ++K, ++TripleNumber) {
+ if (K == J)
+ continue;
+
+ Function *F3 = cast<Function>(*K);
+ int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare();
+ int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();
+
+ bool Transitive = true;
+
+ if (Res1 != 0 && Res1 == Res4) {
+ // F1 > F2, F2 > F3 => F1 > F3
+ Transitive = Res3 == Res1;
+ } else if (Res3 != 0 && Res3 == -Res4) {
+ // F1 > F3, F3 > F2 => F1 > F2
+ Transitive = Res3 == Res1;
+ } else if (Res4 != 0 && -Res3 == Res4) {
+ // F2 > F3, F3 > F1 => F2 > F1
+ Transitive = Res4 == -Res1;
+ }
+
+ if (!Transitive) {
+ dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
+ << TripleNumber << "\n";
+ dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
+ << Res4 << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
+ Valid = false;
+ }
+ }
+ }
+ }
+
+ dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
+ return Valid;
+ }
+ return true;
+}
+#endif
+
+bool MergeFunctions::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+
+ // All functions in the module, ordered by hash. Functions with a unique
+ // hash value are easily eliminated.
+ std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
+ HashedFuncs;
+ for (Function &Func : M) {
+ if (!Func.isDeclaration() && !Func.hasAvailableExternallyLinkage()) {
+ HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
+ }
+ }
+
+ std::stable_sort(
+ HashedFuncs.begin(), HashedFuncs.end(),
+ [](const std::pair<FunctionComparator::FunctionHash, Function *> &a,
+ const std::pair<FunctionComparator::FunctionHash, Function *> &b) {
+ return a.first < b.first;
+ });
+
+ auto S = HashedFuncs.begin();
+ for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
+ // If the hash value matches the previous value or the next one, we must
+ // consider merging it. Otherwise it is dropped and never considered again.
+ if ((I != S && std::prev(I)->first == I->first) ||
+ (std::next(I) != IE && std::next(I)->first == I->first) ) {
+ Deferred.push_back(WeakTrackingVH(I->second));
+ }
+ }
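+  // For example, if the sorted hash values are {1, 4, 4, 9}, only the two
+  // functions hashing to 4 are pushed onto Deferred above; the others cannot
+  // have an equal counterpart and are never compared in full.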
+
+ do {
+ std::vector<WeakTrackingVH> Worklist;
+ Deferred.swap(Worklist);
+
+ DEBUG(doSanityCheck(Worklist));
+
+ DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+ DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+
+ // Insert functions and merge them.
+ for (WeakTrackingVH &I : Worklist) {
+ if (!I)
+ continue;
+ Function *F = cast<Function>(I);
+ if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
+ Changed |= insert(F);
+ }
+ }
+ DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
+ } while (!Deferred.empty());
+
+ FnTree.clear();
+ GlobalNumbers.clear();
+
+ return Changed;
+}
+
+// Replace direct callers of Old with New.
+void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
+ Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
+ for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
+ Use *U = &*UI;
+ ++UI;
+ CallSite CS(U->getUser());
+ if (CS && CS.isCallee(U)) {
+ // Transfer the called function's attributes to the call site. Due to the
+ // bitcast we will 'lose' ABI changing attributes because the 'called
+ // function' is no longer a Function* but the bitcast. Code that looks up
+ // the attributes from the called function will fail.
+
+ // FIXME: This is not actually true, at least not anymore. The callsite
+ // will always have the same ABI affecting attributes as the callee,
+ // because otherwise the original input has UB. Note that Old and New
+ // always have matching ABI, so no attributes need to be changed.
+ // Transferring other attributes may help other optimizations, but that
+ // should be done uniformly and not in this ad-hoc way.
+ auto &Context = New->getContext();
+ auto NewPAL = New->getAttributes();
+ SmallVector<AttributeSet, 4> NewArgAttrs;
+ for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++)
+ NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx));
+      // Don't transfer the function-level attributes to the call site, as
+      // they typically aren't relevant to the calling convention or ABI.
+ CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(),
+ NewPAL.getRetAttributes(),
+ NewArgAttrs));
+
+ remove(CS.getInstruction()->getParent()->getParent());
+ U->set(BitcastNew);
+ }
+ }
+}
+
+// Replace G with an alias to F if possible, or else a thunk to F. Deletes G.
+void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
+ if (HasGlobalAliases && G->hasGlobalUnnamedAddr()) {
+ if (G->hasExternalLinkage() || G->hasLocalLinkage() ||
+ G->hasWeakLinkage()) {
+ writeAlias(F, G);
+ return;
+ }
+ }
+
+ writeThunk(F, G);
+}
+
+// Helper for writeThunk. Selects the proper cast operation,
+// but is a bit simpler than CastInst::getCastOpcode.
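+// For example (illustrative): casting a value of type { i8*, i64 } to
+// { i32*, i64 } rebuilds the struct element by element, bitcasting the
+// pointer element and passing the integer element through unchanged.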
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+ Type *SrcTy = V->getType();
+ if (SrcTy->isStructTy()) {
+ assert(DestTy->isStructTy());
+ assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+ Value *Result = UndefValue::get(DestTy);
+ for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+ Value *Element = createCast(
+ Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
+ DestTy->getStructElementType(I));
+
+ Result =
+ Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
+ }
+ return Result;
+ }
+ assert(!DestTy->isStructTy());
+ if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+ return Builder.CreateIntToPtr(V, DestTy);
+ else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+ return Builder.CreatePtrToInt(V, DestTy);
+ else
+ return Builder.CreateBitCast(V, DestTy);
+}
+
+// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
+// parameter debug info, from the entry block.
+void MergeFunctions::eraseInstsUnrelatedToPDI(
+ std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
+ while (!PDIUnrelatedWL.empty()) {
+ Instruction *I = PDIUnrelatedWL.back();
+ DEBUG(dbgs() << " Deleting Instruction: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ I->eraseFromParent();
+ PDIUnrelatedWL.pop_back();
+ }
+ DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
+}
+
+// Reduce G to its entry block.
+void MergeFunctions::eraseTail(Function *G) {
+
+ std::vector<BasicBlock *> WorklistBB;
+ for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
+ BBI != BBE; ++BBI) {
+ BBI->dropAllReferences();
+ WorklistBB.push_back(&*BBI);
+ }
+ while (!WorklistBB.empty()) {
+ BasicBlock *BB = WorklistBB.back();
+ BB->eraseFromParent();
+ WorklistBB.pop_back();
+ }
+}
+
+// We are interested in the following instructions from the entry block as being
+// related to parameter debug info:
+// - @llvm.dbg.declare
+// - stores from the incoming parameters to locations on the stack-frame
+// - allocas that create these locations on the stack-frame
+// - @llvm.dbg.value
+// - the entry block's terminator
+// The rest are unrelated to debug info for the parameters; fill up
+// PDIUnrelatedWL with such instructions.
+void MergeFunctions::filterInstsUnrelatedToPDI(
+ BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ std::set<Instruction *> PDIRelated;
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
+ BI != BIE; ++BI) {
+ if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DVI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DDI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Parameter: ");
+ DEBUG(DILocVar->print(dbgs()));
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ if (AI) {
+ DEBUG(dbgs() << " Processing alloca users: ");
+ DEBUG(dbgs() << "\n");
+ for (User *U : AI->users()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (Value *Arg = SI->getValueOperand()) {
+ if (dyn_cast<Argument>(Arg)) {
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(AI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(AI);
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(SI);
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(U->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (alloca NULL): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
+ DEBUG(dbgs() << " Will Include Terminator: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+  DEBUG(dbgs()
+        << " Report parameter debug info related/unrelated instructions: {\n");
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
+ BI != BE; ++BI) {
+
+ Instruction *I = &*BI;
+ if (PDIRelated.find(I) == PDIRelated.end()) {
+ DEBUG(dbgs() << " !PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIUnrelatedWL.push_back(I);
+ } else {
+ DEBUG(dbgs() << " PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ DEBUG(dbgs() << " }\n");
+}
+
+// Replace G with a simple tail call to bitcast(F). Also (unless
+// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
+// delete G. Under MergeFunctionsPDI, we use G itself for creating
+// the thunk as we preserve the debug info (and associated instructions)
+// from G's entry block pertaining to G's incoming arguments which are
+// passed on as corresponding arguments in the call that G makes to F.
+// For better debuggability, under MergeFunctionsPDI, we do not modify G's
+// call sites to point to F even when within the same translation unit.
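+// For a thunk whose parameter and return types already match those of F, the
+// generated body is simply (illustrative sketch, not taken from a test):
+//   %r = tail call <ty> @F(<args>)
+//   ret <ty> %r
+// Casts are inserted around the arguments and the return value when the
+// pointer-equivalent types differ.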
+void MergeFunctions::writeThunk(Function *F, Function *G) {
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
+ replaceDirectCallers(G, F);
+ }
+
+ // If G was internal then we may have replaced all uses of G with F. If so,
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) {
+ G->eraseFromParent();
+ return;
+ }
+
+ BasicBlock *GEntryBlock = nullptr;
+ std::vector<Instruction *> PDIUnrelatedWL;
+ BasicBlock *BB = nullptr;
+ Function *NewG = nullptr;
+ if (MergeFunctionsPDI) {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
+ GEntryBlock = &G->getEntryBlock();
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
+ filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
+ GEntryBlock->getTerminator()->eraseFromParent();
+ BB = GEntryBlock;
+ } else {
+ NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
+ G->getParent());
+ BB = BasicBlock::Create(F->getContext(), "", NewG);
+ }
+
+ IRBuilder<> Builder(BB);
+ Function *H = MergeFunctionsPDI ? G : NewG;
+ SmallVector<Value *, 16> Args;
+ unsigned i = 0;
+ FunctionType *FFTy = F->getFunctionType();
+ for (Argument & AI : H->args()) {
+ Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
+ ++i;
+ }
+
+ CallInst *CI = Builder.CreateCall(F, Args);
+ ReturnInst *RI = nullptr;
+ CI->setTailCall();
+ CI->setCallingConv(F->getCallingConv());
+ CI->setAttributes(F->getAttributes());
+ if (H->getReturnType()->isVoidTy()) {
+ RI = Builder.CreateRetVoid();
+ } else {
+ RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
+ }
+
+ if (MergeFunctionsPDI) {
+ DISubprogram *DIS = G->getSubprogram();
+ if (DIS) {
+ DebugLoc CIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ DebugLoc RIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ CI->setDebugLoc(CIDbgLoc);
+ RI->setDebugLoc(RIDbgLoc);
+ } else {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
+ }
+ eraseTail(G);
+ eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
+ DEBUG(dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
+ } else {
+ NewG->copyAttributesFrom(G);
+ NewG->takeName(G);
+ removeUsers(G);
+ G->replaceAllUsesWith(NewG);
+ G->eraseFromParent();
+ }
+
+ DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
+ ++NumThunksWritten;
+}
+
+// Replace G with an alias to F and delete G.
+void MergeFunctions::writeAlias(Function *F, Function *G) {
+ auto *GA = GlobalAlias::create(G->getLinkage(), "", F);
+ F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+ GA->takeName(G);
+ GA->setVisibility(G->getVisibility());
+ removeUsers(G);
+ G->replaceAllUsesWith(GA);
+ G->eraseFromParent();
+
+ DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
+ ++NumAliasesWritten;
+}
+
+// Merge two equivalent functions. Upon completion, G may be deleted or
+// converted into a thunk (or alias) to F.
+void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
+ if (F->isInterposable()) {
+ assert(G->isInterposable());
+
+ // Make them both thunks to the same internal function.
+ Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
+ F->getParent());
+ H->copyAttributesFrom(F);
+ H->takeName(F);
+ removeUsers(F);
+ F->replaceAllUsesWith(H);
+
+ unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment());
+
+ if (HasGlobalAliases) {
+ writeAlias(F, G);
+ writeAlias(F, H);
+ } else {
+ writeThunk(F, G);
+ writeThunk(F, H);
+ }
+
+ F->setAlignment(MaxAlignment);
+ F->setLinkage(GlobalValue::PrivateLinkage);
+ ++NumDoubleWeak;
+ } else {
+ writeThunkOrAlias(F, G);
+ }
+
+ ++NumFunctionsMerged;
+}
+
+/// Replace function F by function G.
+void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
+ Function *G) {
+ Function *F = FN.getFunc();
+ assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 &&
+ "The two functions must be equal");
+
+ auto I = FNodesInTree.find(F);
+ assert(I != FNodesInTree.end() && "F should be in FNodesInTree");
+ assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G");
+
+ FnTreeType::iterator IterToFNInFnTree = I->second;
+ assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree.");
+ // Remove F -> FN and insert G -> FN
+ FNodesInTree.erase(I);
+ FNodesInTree.insert({G, IterToFNInFnTree});
+ // Replace F with G in FN, which is stored inside the FnTree.
+ FN.replaceBy(G);
+}
+
+// Insert a function into the FnTree, or merge it away if equal to one
+// that was already inserted.
+bool MergeFunctions::insert(Function *NewFunction) {
+ std::pair<FnTreeType::iterator, bool> Result =
+ FnTree.insert(FunctionNode(NewFunction));
+
+ if (Result.second) {
+ assert(FNodesInTree.count(NewFunction) == 0);
+ FNodesInTree.insert({NewFunction, Result.first});
+ DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
+ return false;
+ }
+
+ const FunctionNode &OldF = *Result.first;
+
+ // Don't merge tiny functions, since it can just end up making the function
+ // larger.
+ // FIXME: Should still merge them if they are unnamed_addr and produce an
+ // alias.
+ if (NewFunction->size() == 1) {
+ if (NewFunction->front().size() <= 2) {
+      DEBUG(dbgs() << NewFunction->getName()
+                   << " is too small to bother merging\n");
+ return false;
+ }
+ }
+
+ // Impose a total order (by name) on the replacement of functions. This is
+ // important when operating on more than one module independently to prevent
+ // cycles of thunks calling each other when the modules are linked together.
+ //
+ // First of all, we process strong functions before weak functions.
+ if ((OldF.getFunc()->isInterposable() && !NewFunction->isInterposable()) ||
+ (OldF.getFunc()->isInterposable() == NewFunction->isInterposable() &&
+ OldF.getFunc()->getName() > NewFunction->getName())) {
+ // Swap the two functions.
+ Function *F = OldF.getFunc();
+ replaceFunctionInTree(*Result.first, NewFunction);
+ NewFunction = F;
+ assert(OldF.getFunc() != F && "Must have swapped the functions.");
+ }
+
+ DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
+
+ Function *DeleteF = NewFunction;
+ mergeTwoFunctions(OldF.getFunc(), DeleteF);
+ return true;
+}
+
+// Remove a function from FnTree. If it was already in FnTree, add
+// it to Deferred so that we'll look at it in the next round.
+void MergeFunctions::remove(Function *F) {
+ auto I = FNodesInTree.find(F);
+ if (I != FNodesInTree.end()) {
+ DEBUG(dbgs() << "Deferred " << F->getName()<< ".\n");
+ FnTree.erase(I->second);
+ // I->second has been invalidated, remove it from the FNodesInTree map to
+ // preserve the invariant.
+ FNodesInTree.erase(I);
+ Deferred.emplace_back(F);
+ }
+}
+
+// For each instruction used by the value, remove() the function that contains
+// the instruction. This should happen right before a call to RAUW.
+void MergeFunctions::removeUsers(Value *V) {
+ std::vector<Value *> Worklist;
+ Worklist.push_back(V);
+ SmallSet<Value*, 8> Visited;
+ Visited.insert(V);
+ while (!Worklist.empty()) {
+ Value *V = Worklist.back();
+ Worklist.pop_back();
+
+ for (User *U : V->users()) {
+ if (Instruction *I = dyn_cast<Instruction>(U)) {
+ remove(I->getParent()->getParent());
+ } else if (isa<GlobalValue>(U)) {
+ // do nothing
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ for (User *UU : C->users()) {
+ if (!Visited.insert(UU).second)
+ Worklist.push_back(UU);
+ }
+ }
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
new file mode 100644
index 000000000000..8840435af642
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -0,0 +1,993 @@
+//===- PartialInlining.cpp - Inline parts of functions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs partial inlining, typically by inlining an if statement
+// that surrounds the body of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-inlining"
+
+STATISTIC(NumPartialInlined,
+ "Number of callsites functions partially inlined into.");
+
+// Command line option to disable partial-inlining. The default is false:
+static cl::opt<bool>
+    DisablePartialInlining("disable-partial-inlining", cl::init(false),
+                           cl::Hidden, cl::desc("Disable partial inlining"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
+
+static cl::opt<unsigned> MaxNumInlineBlocks(
+ "max-num-inline-blocks", cl::init(5), cl::Hidden,
+ cl::desc("Max Number of Blocks To be Partially Inlined"));
+
+// Command line option to set the maximum number of partial inlining allowed
+// for the module. The default value of -1 means no limit.
+static cl::opt<int> MaxNumPartialInlining(
+ "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of partial inlining. The default is unlimited"));
+
+// Used only when PGO or user-annotated branch data is absent. It is
+// the minimum relative frequency used to weight the outlined region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
+
+namespace {
+
+struct FunctionOutliningInfo {
+ FunctionOutliningInfo()
+ : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
+ ReturnBlockPreds() {}
+ // Returns the number of blocks to be inlined including all blocks
+ // in Entries and one return block.
+ unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
+
+ // A set of blocks including the function entry that guard
+ // the region to be outlined.
+ SmallVector<BasicBlock *, 4> Entries;
+ // The return block that is not included in the outlined region.
+ BasicBlock *ReturnBlock;
+ // The dominating block of the region to be outlined.
+ BasicBlock *NonReturnBlock;
+  // The set of blocks in Entries that are predecessors of ReturnBlock.
+ SmallVector<BasicBlock *, 4> ReturnBlockPreds;
+};
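+
+// Illustrative shape (a sketch, not taken from the source) of a function for
+// which computeOutliningInfo below succeeds: the entry block guards an early
+// return, and the cold path becomes the outlined region:
+//
+//   entry:                     ; Entries[0]
+//     br i1 %cond, label %ret, label %cold
+//   ret:                       ; ReturnBlock, stays inlined in the caller
+//     ret void
+//   cold:                      ; NonReturnBlock, head of the outlined region
+//     ...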
+
+struct PartialInlinerImpl {
+ PartialInlinerImpl(
+ std::function<AssumptionCache &(Function &)> *GetAC,
+ std::function<TargetTransformInfo &(Function &)> *GTTI,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
+ ProfileSummaryInfo *ProfSI)
+ : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
+ bool run(Module &M);
+ Function *unswitchFunction(Function *F);
+
+  // This class speculatively clones the function to be partially inlined.
+ // At the end of partial inlining, the remaining callsites to the cloned
+ // function that are not partially inlined will be fixed up to reference
+ // the original function, and the cloned function will be erased.
+ struct FunctionCloner {
+ FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+ ~FunctionCloner();
+
+ // Prepare for function outlining: making sure there is only
+ // one incoming edge from the extracted/outlined region to
+ // the return block.
+ void NormalizeReturnBlock();
+
+ // Do function outlining:
+ Function *doFunctionOutlining();
+
+ Function *OrigFunc = nullptr;
+ Function *ClonedFunc = nullptr;
+ Function *OutlinedFunc = nullptr;
+ BasicBlock *OutliningCallBB = nullptr;
+ // ClonedFunc is inlined in one of its callers after function
+ // outlining.
+ bool IsFunctionInlined = false;
+ // The cost of the region to be outlined.
+ int OutlinedRegionCost = 0;
+ std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+ };
+
+private:
+ int NumPartialInlining = 0;
+ std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
+ std::function<TargetTransformInfo &(Function &)> *GetTTI;
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
+ ProfileSummaryInfo *PSI;
+
+  // Return the frequency of the OutliningCallBB relative to F's entry point.
+ // The result is no larger than 1 and is represented using BP.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
+
+  // Return true if the callee of CS can profitably be partially inlined.
+ bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to
+  // the OutlinedFunction) into its callers. Return true
+  // if there is any successful inlining.
+ bool tryPartialInline(FunctionCloner &Cloner);
+
+  // Compute the mapping from each use site of DuplicateFunction to the
+  // enclosing BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
+ bool IsLimitReached() {
+ return (MaxNumPartialInlining != -1 &&
+ NumPartialInlining >= MaxNumPartialInlining);
+ }
+
+ static CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ static CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the estimated size of the new call sequence in
+  //   basic block Cloner.OutliningCallBB;
+  // - The second value is the non-weighted runtime overhead of making the
+  //   call to the outlined function, including the additional setup cost
+  //   in the outlined function itself.
+ std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
+  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and the runtime cost (note that the current
+  // inline cost analysis does not clearly distinguish the two either).
+ static int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
+};
+
+struct PartialInlinerLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ PartialInlinerLegacyPass() : ModulePass(ID) {
+ initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ TargetTransformInfoWrapperPass *TTIWP =
+ &getAnalysis<TargetTransformInfoWrapperPass>();
+ ProfileSummaryInfo *PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&ACT](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&TTIWP](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ };
+
+ return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
+ }
+};
+}
+
+std::unique_ptr<FunctionOutliningInfo>
+PartialInlinerImpl::computeOutliningInfo(Function *F) {
+ BasicBlock *EntryBlock = &F->front();
+ BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
+ if (!BR || BR->isUnconditional())
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Returns true if Succ is BB's successor
+ auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
+ return is_contained(successors(BB), Succ);
+ };
+
+ auto SuccSize = [](BasicBlock *BB) {
+ return std::distance(succ_begin(BB), succ_end(BB));
+ };
+
+ auto IsReturnBlock = [](BasicBlock *BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ return isa<ReturnInst>(TI);
+ };
+
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsReturnBlock(Succ1))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsReturnBlock(Succ2))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ // Detect a triangular shape:
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsSuccessor(Succ1, Succ2))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsSuccessor(Succ2, Succ1))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
+ llvm::make_unique<FunctionOutliningInfo>();
+
+ BasicBlock *CurrEntry = EntryBlock;
+ bool CandidateFound = false;
+ do {
+ // The number of blocks to be inlined has already reached
+ // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
+ // disables partial inlining for the function.
+ if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
+ break;
+
+ if (SuccSize(CurrEntry) != 2)
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(CurrEntry);
+ BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+
+ if (ReturnBlock) {
+ OutliningInfo->Entries.push_back(CurrEntry);
+ OutliningInfo->ReturnBlock = ReturnBlock;
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ CandidateFound = true;
+ break;
+ }
+
+ BasicBlock *CommSucc;
+ BasicBlock *OtherSucc;
+ std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
+
+ if (!CommSucc)
+ break;
+
+ OutliningInfo->Entries.push_back(CurrEntry);
+ CurrEntry = OtherSucc;
+
+ } while (true);
+
+ if (!CandidateFound)
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+  // Sanity check the entries: there should not be any successors
+  // (outside the entry set) other than {ReturnBlock, NonReturnBlock}.
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
+ DenseSet<BasicBlock *> Entries;
+ for (BasicBlock *E : OutliningInfo->Entries)
+ Entries.insert(E);
+
+  // Returns true if BB has a predecessor that is not
+  // in the Entries set.
+ auto HasNonEntryPred = [Entries](BasicBlock *BB) {
+ for (auto Pred : predecessors(BB)) {
+ if (!Entries.count(Pred))
+ return true;
+ }
+ return false;
+ };
+ auto CheckAndNormalizeCandidate =
+ [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
+ for (BasicBlock *E : OutliningInfo->Entries) {
+ for (auto Succ : successors(E)) {
+ if (Entries.count(Succ))
+ continue;
+ if (Succ == OutliningInfo->ReturnBlock)
+ OutliningInfo->ReturnBlockPreds.push_back(E);
+ else if (Succ != OutliningInfo->NonReturnBlock)
+ return false;
+ }
+ // There should not be any outside incoming edges either:
+ if (HasNonEntryPred(E))
+ return false;
+ }
+ return true;
+ };
+
+ if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+  // Now further grow the candidate's inlining region by peeling off
+  // dominating blocks from the outlining region:
+ while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
+ BasicBlock *Cand = OutliningInfo->NonReturnBlock;
+ if (SuccSize(Cand) != 2)
+ break;
+
+ if (HasNonEntryPred(Cand))
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(Cand);
+ BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+ if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
+ break;
+
+ if (NonReturnBlock->getSinglePredecessor() != Cand)
+ break;
+
+    // Now grow and update OutliningInfo:
+ OutliningInfo->Entries.push_back(Cand);
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ OutliningInfo->ReturnBlockPreds.push_back(Cand);
+ Entries.insert(Cand);
+ }
+
+ return OutliningInfo;
+}
+
+// Check if there is PGO data or user-annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+  // Now check if any of the entry blocks has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability
+PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
+
+ auto EntryFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+ auto OutliningCallFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
+ return OutlineRegionRelFreq;
+
+  // When profile data is not available, we need to be conservative in
+  // estimating the overall savings. Static branch prediction can usually
+  // guess the branch direction right (taken/non-taken), but the guessed
+  // branch probability is usually not biased enough. When the
+  // outlined region is predicted to be likely, its probability needs
+  // to be made higher (more biased) to not under-estimate the cost of
+  // function outlining. On the other hand, if the outlined region
+  // is predicted to be less likely, the predicted probability is usually
+  // higher than the actual one. For instance, the actual probability of the
+  // less likely target may be only 5%, while the guessed probability can be
+  // 40%. In the latter case, there is no need for further adjustment.
+ // FIXME: add an option for this.
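+  // For example (illustrative): with the default OutlineRegionFreqPercent of
+  // 75, a guessed relative frequency of 40% is returned unchanged (it falls
+  // below the 45% threshold), while a guessed 60% is raised to 75%.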
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE) {
+
+ using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
+ Instruction *Call = CS.getInstruction();
+ Function *Callee = CS.getCalledFunction();
+ assert(Callee == Cloner.ClonedFunc);
+
+ Function *Caller = CS.getCaller();
+ auto &CalleeTTI = (*GetTTI)(*Callee);
+ InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
+ *GetAssumptionCache, GetBFI, PSI);
+
+ if (IC.isAlways()) {
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+ << NV("Callee", Cloner.OrigFunc)
+ << " should always be fully inlined, not partially");
+ return false;
+ }
+
+ if (IC.isNever()) {
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)");
+ return false;
+ }
+
+ if (!IC) {
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return false;
+ }
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+  // If the weighted savings are smaller than the weighted cost, return false.
+ if (NormWeightedSavings < WeightedOutliningRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
+
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
+ << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return true;
+}
+
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
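+// For example (illustrative): a block containing an add, an icmp, and a
+// conditional branch costs 3 * InlineConstants::InstrCost, while "free"
+// instructions such as bitcasts, allocas, and debug intrinsics add nothing.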
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ continue;
+ default:
+ break;
+ }
+
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *Cloner.OutlinedFunc) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
+
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
+}
+
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> TempBFI;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
+ CurrentCallerBFI = TempBFI.get();
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
+ FunctionOutliningInfo *OI)
+ : OrigFunc(F) {
+ ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
+ for (BasicBlock *BB : OI->Entries) {
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+ }
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
+
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+
+ // Special hackery is needed with PHI nodes that have inputs from more than
+ // one extracted block. For simplicity, just split the PHIs into a two-level
+ // sequence of PHIs, some of which will go in the extracted region, and some
+ // of which will go outside.
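+  // Illustrative example (not taken from the source): with one guarding
+  // entry block and a return-block phi
+  //   %r = phi i32 [ %a, %entry ], [ %b, %cold1 ], [ %c, %cold2 ]
+  // the incoming values from %cold1/%cold2 stay in a phi that is extracted
+  // with the outlined region, and a new phi in the split-off return block
+  // merges that result with %a coming from %entry.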
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
+  // Only split the block when necessary:
+ PHINode *FirstPhi = getFirstPHI(PreReturn);
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
+ auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+ Value *CommonValue = PN->getIncomingValue(0);
+ if (all_of(PN->incoming_values(),
+ [&](Value *V) { return V == CommonValue; }))
+ return CommonValue;
+ return nullptr;
+ };
+
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
+
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
+
+ RetPhi->addIncoming(&*I, PreReturn);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
+ }
+
+    // After splitting the incoming values, the old phi may become trivial.
+    // Keeping the trivial phi would introduce a definition inside the
+    // outlined region which is live-out, causing extra overhead (load,
+    // store, argument passing, etc.).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
+ ++I;
+ }
+ for (auto *DP : DeadPhis)
+ DP->eraseFromParent();
+
+ for (auto E : ClonedOI->ReturnBlockPreds) {
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+ }
+}
+
+Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
+  // Returns true if the block is to be partially inlined into the caller
+  // (i.e. not to be extracted to the out-of-line function).
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
+ (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+ ClonedOI->Entries.end());
+ };
+
+ // Gather up the blocks that we're going to extract.
+ std::vector<BasicBlock *> ToExtract;
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
+ OutlinedRegionCost +=
+ PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
+ ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function which may make the outlining
+ // overhead (the difference of the outlined function cost
+ // and OutliningRegionCost) look larger.
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Extract the body of the if.
+ OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI)
+ .extractCodeRegion();
+
+ if (OutlinedFunc) {
+ OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+ .getInstruction()
+ ->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ }
+
+ return OutlinedFunc;
+}
+
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove the function that is speculatively created if there is no
+ // reference.
+ if (OutlinedFunc)
+ OutlinedFunc->eraseFromParent();
+ }
+}
+
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+
+ if (F->hasAddressTaken())
+ return nullptr;
+
+ // Let inliner handle it
+ if (F->hasFnAttribute(Attribute::AlwaysInline))
+ return nullptr;
+
+ if (F->hasFnAttribute(Attribute::NoInline))
+ return nullptr;
+
+ if (PSI->isFunctionEntryCold(F))
+ return nullptr;
+
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+ if (!OI)
+ return nullptr;
+
+ FunctionCloner Cloner(F, OI.get());
+ Cloner.NormalizeReturnBlock();
+ Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+ bool AnyInline = tryPartialInline(Cloner);
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
+ int NonWeightedRcost;
+ int SizeCost;
+
+ if (Cloner.OutlinedFunc == nullptr)
+ return false;
+
+ std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
+
+ auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+ auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
+
+  // If the call sequence to the outlined function is larger than the original
+  // outlined region, outlining does not increase the chances of inlining
+  // the function (the inliner uses the size increase to
+  // model the cost of inlining a callee).
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", Cloner.OrigFunc)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
+ "F's users should all be replaced!");
+
+ std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+ Cloner.ClonedFunc->user_end());
+
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (Cloner.OrigFunc->getEntryCount())
+ computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
+ bool AnyInline = false;
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+
+ if (IsLimitReached())
+ continue;
+
+ OptimizationRemarkEmitter ORE(CS.getCaller());
+
+ if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
+ continue;
+
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
+
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
+ InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
+ NumPartialInlining++;
+ // Update the stats
+ NumPartialInlined++;
+ }
+
+ if (AnyInline) {
+ Cloner.IsFunctionInlined = true;
+ if (CalleeEntryCount)
+ Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+ }
+
+ return AnyInline;
+}
+
+bool PartialInlinerImpl::run(Module &M) {
+ if (DisablePartialInlining)
+ return false;
+
+ std::vector<Function *> Worklist;
+ Worklist.reserve(M.size());
+ for (Function &F : M)
+ if (!F.use_empty() && !F.isDeclaration())
+ Worklist.push_back(&F);
+
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ Function *CurrFunc = Worklist.back();
+ Worklist.pop_back();
+
+ if (CurrFunc->use_empty())
+ continue;
+
+ bool Recursive = false;
+ for (User *U : CurrFunc->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent()->getParent() == CurrFunc) {
+ Recursive = true;
+ break;
+ }
+ if (Recursive)
+ continue;
+
+ if (Function *NewFunc = unswitchFunction(CurrFunc)) {
+ Worklist.push_back(NewFunc);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+char PartialInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+
+ModulePass *llvm::createPartialInliningPass() {
+ return new PartialInlinerLegacyPass();
+}
+
+PreservedAnalyses PartialInlinerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&FAM](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ std::function<BlockFrequencyInfo &(Function &)> GetBFI =
+ [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
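+
+// Minimal usage sketch with the new pass manager (illustrative; assumes a
+// ModuleAnalysisManager MAM that has been set up via a PassBuilder):
+//   ModulePassManager MPM;
+//   MPM.addPass(PartialInlinerPass());
+//   MPM.run(M, MAM);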
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
new file mode 100644
index 000000000000..0b319f6a488b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -0,0 +1,992 @@
+//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
+                       cl::ZeroOrMore, cl::desc("Run Partial inlining pass"));
+
+static cl::opt<bool>
+ RunLoopVectorization("vectorize-loops", cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
+
+static cl::opt<bool>
+RunSLPVectorization("vectorize-slp", cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
+
+static cl::opt<bool>
+UseGVNAfterVectorization("use-gvn-after-vectorization",
+ cl::init(false), cl::Hidden,
+ cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+
+static cl::opt<bool> ExtraVectorizerPasses(
+ "extra-vectorizer-passes", cl::init(false), cl::Hidden,
+ cl::desc("Run cleanup optimization passes after vectorization."));
+
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+ cl::desc("Run the loop rerolling pass"));
+
+static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
+ cl::desc("Run the NewGVN pass"));
+
+static cl::opt<bool>
+RunSLPAfterLoopVectorization("run-slp-after-loop-vectorization",
+ cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorizer (and BB vectorizer) after the Loop "
+ "vectorizer instead of before"));
+
+// Experimental option to use CFL-AA
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
+static cl::opt<CFLAAType>
+ UseCFLAA("use-cfl-aa", cl::init(CFLAAType::None), cl::Hidden,
+ cl::desc("Enable the new, experimental CFL alias analysis"),
+ cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
+ clEnumValN(CFLAAType::Steensgaard, "steens",
+ "Enable unification-based CFL-AA"),
+ clEnumValN(CFLAAType::Andersen, "anders",
+ "Enable inclusion-based CFL-AA"),
+ clEnumValN(CFLAAType::Both, "both",
+ "Enable both variants of CFL-AA")));
+
+static cl::opt<bool> EnableLoopInterchange(
+ "enable-loopinterchange", cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental LoopInterchange Pass"));
+
+static cl::opt<bool> EnableNonLTOGlobalsModRef(
+ "enable-non-lto-gmr", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline."));
+
+static cl::opt<bool> EnableLoopLoadElim(
+ "enable-loop-load-elim", cl::init(true), cl::Hidden,
+ cl::desc("Enable the LoopLoadElimination Pass"));
+
+static cl::opt<bool>
+ EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable preparation for ThinLTO."));
+
+static cl::opt<bool> RunPGOInstrGen(
+ "profile-generate", cl::init(false), cl::Hidden,
+ cl::desc("Enable PGO instrumentation."));
+
+static cl::opt<std::string>
+ PGOOutputFile("profile-generate-file", cl::init(""), cl::Hidden,
+ cl::desc("Specify the path of profile data file."));
+
+static cl::opt<std::string> RunPGOInstrUse(
+ "profile-use", cl::init(""), cl::Hidden, cl::value_desc("filename"),
+ cl::desc("Enable use phase of PGO instrumentation and specify the path "
+ "of profile data file"));
+
+static cl::opt<bool> UseLoopVersioningLICM(
+ "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
+ cl::desc("Enable the experimental Loop Versioning LICM pass"));
+
+static cl::opt<bool>
+ DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
+ cl::desc("Disable pre-instrumentation inliner"));
+
+static cl::opt<int> PreInlineThreshold(
+ "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
+ cl::desc("Control the amount of inlining in pre-instrumentation inliner "
+ "(default = 75)"));
+
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+ "enable-earlycse-memssa", cl::init(true), cl::Hidden,
+ cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)"));
+
+static cl::opt<bool> EnableGVNHoist(
+ "enable-gvn-hoist", cl::init(false), cl::Hidden,
+ cl::desc("Enable the GVN hoisting pass (default = off)"));
+
+static cl::opt<bool>
+ DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable shrink-wrap library calls"));
+
+static cl::opt<bool>
+ EnableSimpleLoopUnswitch("enable-simple-loop-unswitch", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass."));
+
+static cl::opt<bool> EnableGVNSink(
+ "enable-gvn-sink", cl::init(false), cl::Hidden,
+ cl::desc("Enable the GVN sinking pass (default = off)"));
+
+PassManagerBuilder::PassManagerBuilder() {
+ OptLevel = 2;
+ SizeLevel = 0;
+ LibraryInfo = nullptr;
+ Inliner = nullptr;
+ DisableUnitAtATime = false;
+ DisableUnrollLoops = false;
+ SLPVectorize = RunSLPVectorization;
+ LoopVectorize = RunLoopVectorization;
+ RerollLoops = RunLoopRerolling;
+ NewGVN = RunNewGVN;
+ DisableGVNLoadPRE = false;
+ VerifyInput = false;
+ VerifyOutput = false;
+ MergeFunctions = false;
+ PrepareForLTO = false;
+ EnablePGOInstrGen = RunPGOInstrGen;
+ PGOInstrGen = PGOOutputFile;
+ PGOInstrUse = RunPGOInstrUse;
+ PrepareForThinLTO = EnablePrepareForThinLTO;
+ PerformThinLTO = false;
+ DivergentTarget = false;
+}
+
+PassManagerBuilder::~PassManagerBuilder() {
+ delete LibraryInfo;
+ delete Inliner;
+}
+
+/// Set of global extensions, automatically added as part of the standard set.
+static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
+ PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions;
+
+/// Check if GlobalExtensions is constructed and not empty.
+/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
+/// the construction of the object.
+static bool GlobalExtensionsNotEmpty() {
+ return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
+}
+
+void PassManagerBuilder::addGlobalExtension(
+ PassManagerBuilder::ExtensionPointTy Ty,
+ PassManagerBuilder::ExtensionFn Fn) {
+ GlobalExtensions->push_back(std::make_pair(Ty, std::move(Fn)));
+}
+
+void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+ Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
+}
+
+void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
+ legacy::PassManagerBase &PM) const {
+ if (GlobalExtensionsNotEmpty()) {
+ for (auto &Ext : *GlobalExtensions) {
+ if (Ext.first == ETy)
+ Ext.second(*this, PM);
+ }
+ }
+ for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+ if (Extensions[i].first == ETy)
+ Extensions[i].second(*this, PM);
+}
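+
+// Illustrative example (caller-side code, not part of this file): a frontend
+// can hook a custom pass into the standard pipeline through the extension
+// mechanism above, e.g. at EP_EarlyAsPossible.
+// \code
+//   PassManagerBuilder PMB;
+//   PMB.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
+//                    [](const PassManagerBuilder &Builder,
+//                       legacy::PassManagerBase &PM) {
+//                      PM.add(createPromoteMemoryToRegisterPass());
+//                    });
+// \endcode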
+
+void PassManagerBuilder::addInitialAliasAnalysisPasses(
+ legacy::PassManagerBase &PM) const {
+ switch (UseCFLAA) {
+ case CFLAAType::Steensgaard:
+ PM.add(createCFLSteensAAWrapperPass());
+ break;
+ case CFLAAType::Andersen:
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ case CFLAAType::Both:
+ PM.add(createCFLSteensAAWrapperPass());
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ PM.add(createTypeBasedAAWrapperPass());
+ PM.add(createScopedNoAliasAAWrapperPass());
+}
+
+void PassManagerBuilder::addInstructionCombiningPass(
+ legacy::PassManagerBase &PM) const {
+ bool ExpensiveCombines = OptLevel > 2;
+ PM.add(createInstructionCombiningPass(ExpensiveCombines));
+}
+
+void PassManagerBuilder::populateFunctionPassManager(
+ legacy::FunctionPassManager &FPM) {
+ addExtensionsToPM(EP_EarlyAsPossible, FPM);
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (OptLevel == 0) return;
+
+ addInitialAliasAnalysisPasses(FPM);
+
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createSROAPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createLowerExpectIntrinsicPass());
+}
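+
+// Illustrative example of the typical driver-side usage of this builder
+// (mirrors the pattern documented in PassManagerBuilder.h; "M" is an assumed
+// llvm::Module, and ownership of the inliner passes to the builder):
+// \code
+//   PassManagerBuilder Builder;
+//   Builder.OptLevel = 2;
+//   Builder.Inliner = createFunctionInliningPass(/*Threshold=*/275);
+//
+//   legacy::FunctionPassManager FPM(&M);
+//   legacy::PassManager MPM;
+//   Builder.populateFunctionPassManager(FPM);
+//   Builder.populateModulePassManager(MPM);
+//
+//   FPM.doInitialization();
+//   for (Function &F : M)
+//     FPM.run(F);
+//   FPM.doFinalization();
+//   MPM.run(M);
+// \endcode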
+
+// Do PGO instrumentation generation or use pass as the option specified.
+void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
+ if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
+ return;
+ // Perform the preinline and cleanup passes for O1 and above.
+ // And avoid doing them if optimizing for size.
+ if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner &&
+ PGOSampleUse.empty()) {
+    // Create the preinline pass. We construct an InlineParams object and
+    // specify the threshold here so that the command line options of the
+    // regular inliner do not influence pre-inlining. The only fields of
+    // InlineParams we care about are DefaultThreshold and HintThreshold.
+ InlineParams IP;
+ IP.DefaultThreshold = PreInlineThreshold;
+ // FIXME: The hint threshold has the same value used by the regular inliner.
+ // This should probably be lowered after performance testing.
+ IP.HintThreshold = 325;
+
+ MPM.add(createFunctionInliningPass(IP));
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+ addExtensionsToPM(EP_Peephole, MPM);
+ }
+ if (EnablePGOInstrGen) {
+ MPM.add(createPGOInstrumentationGenLegacyPass());
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!PGOInstrGen.empty())
+ Options.InstrProfileOutput = PGOInstrGen;
+ Options.DoCounterPromotion = true;
+ MPM.add(createLoopRotatePass());
+ MPM.add(createInstrProfilingLegacyPass(Options));
+ }
+ if (!PGOInstrUse.empty())
+ MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse));
+ // Indirect call promotion that promotes intra-module targets only.
+ // For ThinLTO this is done earlier due to interactions with globalopt
+ // for imported functions. We don't run this at -O0.
+ if (OptLevel > 0)
+ MPM.add(
+ createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
+}
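+
+// Illustrative example (driver-side, not part of this file): the
+// instrumentation-generation phase above is requested by configuring the
+// builder before populating the pass managers; "code.profraw" is a
+// placeholder output path, not a default.
+// \code
+//   PassManagerBuilder Builder;
+//   Builder.EnablePGOInstrGen = true;
+//   Builder.PGOInstrGen = "code.profraw";
+//   // ... then call populateModulePassManager() as in the example above.
+// \endcode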
+void PassManagerBuilder::addFunctionSimplificationPasses(
+ legacy::PassManagerBase &MPM) {
+ // Start of function pass.
+ // Break up aggregate allocas, using SSAUpdater.
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies
+ if (EnableGVNHoist)
+ MPM.add(createGVNHoistPass());
+ if (EnableGVNSink) {
+ MPM.add(createGVNSinkPass());
+ MPM.add(createCFGSimplificationPass());
+ }
+
+ // Speculative execution if the target has divergent branches; otherwise nop.
+ MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
+ MPM.add(createJumpThreadingPass()); // Thread jumps.
+ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Combine silly seq's
+ addInstructionCombiningPass(MPM);
+ if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
+ MPM.add(createLibCallsShrinkWrapPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+
+ // Optimize memory intrinsic calls based on the profiled size information.
+ if (SizeLevel == 0)
+ MPM.add(createPGOMemOPSizeOptLegacyPass());
+
+ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createReassociatePass()); // Reassociate expressions
+ // Rotate Loop - disable header duplication at -Oz
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+ MPM.add(createLICMPass()); // Hoist loop invariants
+ if (EnableSimpleLoopUnswitch)
+ MPM.add(createSimpleLoopUnswitchLegacyPass());
+ else
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ MPM.add(createCFGSimplificationPass());
+ addInstructionCombiningPass(MPM);
+ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+ MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+ MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+ if (EnableLoopInterchange) {
+ MPM.add(createLoopInterchangePass()); // Interchange loops
+ MPM.add(createCFGSimplificationPass());
+ }
+ if (!DisableUnrollLoops)
+ MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
+ addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+
+ if (OptLevel > 1) {
+ MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+ MPM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ }
+ MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
+ MPM.add(createSCCPPass()); // Constant prop with SCCP
+
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations
+
+ // Run instcombine after redundancy elimination to exploit opportunities
+ // opened up by them.
+ addInstructionCombiningPass(MPM);
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createJumpThreadingPass()); // Thread jumps
+ MPM.add(createCorrelatedValuePropagationPass());
+ MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
+ MPM.add(createLICMPass());
+
+ addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+
+ if (RerollLoops)
+ MPM.add(createLoopRerollPass());
+ if (!RunSLPAfterLoopVectorization && SLPVectorize)
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+ MPM.add(createAggressiveDCEPass()); // Delete dead instructions
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Clean up after everything.
+ addInstructionCombiningPass(MPM);
+ addExtensionsToPM(EP_Peephole, MPM);
+}
+
+void PassManagerBuilder::populateModulePassManager(
+ legacy::PassManagerBase &MPM) {
+ if (!PGOSampleUse.empty()) {
+ MPM.add(createPruneEHPass());
+ MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ MPM.add(createForceFunctionAttrsLegacyPass());
+
+ // If all optimizations are disabled, just run the always-inline pass and,
+ // if enabled, the function merging pass.
+ if (OptLevel == 0) {
+ addPGOInstrPasses(MPM);
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
+ // creates a CGSCC pass manager, but we don't want to add extensions into
+ // that pass manager. To prevent this we insert a no-op module pass to reset
+ // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
+    // builds. The function merging pass itself serves as that barrier when
+    // it is enabled.
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+ else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
+ MPM.add(createBarrierNoopPass());
+
+ addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+
+ // Rename anon globals to be able to export them in the summary.
+ // This has to be done after we add the extensions to the pass manager
+    // as there could be passes (e.g. Address sanitizer) which introduce
+ // new unnamed globals.
+ if (PrepareForThinLTO)
+ MPM.add(createNameAnonGlobalPass());
+ return;
+ }
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ addInitialAliasAnalysisPasses(MPM);
+
+ // For ThinLTO there are two passes of indirect call promotion. The
+ // first is during the compile phase when PerformThinLTO=false and
+ // intra-module indirect call targets are promoted. The second is during
+ // the ThinLTO backend when PerformThinLTO=true, when we promote imported
+ // inter-module indirect calls. For that we perform indirect call promotion
+ // earlier in the pass pipeline, here before globalopt. Otherwise imported
+ // available_externally functions look unreferenced and are removed.
+ if (PerformThinLTO)
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+ !PGOSampleUse.empty()));
+
+ // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
+ // as it will change the CFG too much to make the 2nd profile annotation
+ // in backend more difficult.
+ bool PrepareForThinLTOUsingPGOSampleProfile =
+ PrepareForThinLTO && !PGOSampleUse.empty();
+ if (PrepareForThinLTOUsingPGOSampleProfile)
+ DisableUnrollLoops = true;
+
+ if (!DisableUnitAtATime) {
+ // Infer attributes about declarations if possible.
+ MPM.add(createInferFunctionAttrsLegacyPass());
+
+ addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+ // Promote any localized global vars.
+ MPM.add(createPromoteMemoryToRegisterPass());
+
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+ }
+
+ // For SamplePGO in ThinLTO compile phase, we do not want to do indirect
+ // call promotion as it will change the CFG too much to make the 2nd
+ // profile annotation in backend more difficult.
+ // PGO instrumentation is added during the compile phase for ThinLTO, do
+ // not run it a second time
+ if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile)
+ addPGOInstrPasses(MPM);
+
+ if (EnableNonLTOGlobalsModRef)
+ // We add a module alias analysis pass here. In part due to bugs in the
+ // analysis infrastructure this "works" in that the analysis stays alive
+ // for the entire SCC pass run below.
+ MPM.add(createGlobalsAAWrapperPass());
+
+ // Start of CallGraph SCC passes.
+ if (!DisableUnitAtATime)
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+ if (!DisableUnitAtATime)
+ MPM.add(createPostOrderFunctionAttrsLegacyPass());
+ if (OptLevel > 2)
+ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+
+ addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
+ addFunctionSimplificationPasses(MPM);
+
+ // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+ // pass manager that we are specifically trying to avoid. To prevent this
+ // we must insert a no-op module pass to reset the pass manager.
+ MPM.add(createBarrierNoopPass());
+ if (RunPartialInlining)
+ MPM.add(createPartialInliningPass());
+
+ if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO &&
+ !PrepareForThinLTO)
+ // Remove avail extern fns and globals definitions if we aren't
+ // compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link-time. Note if they
+ // are unreferenced they will be removed by GlobalDCE later, so
+ // this only impacts referenced available externally globals.
+ // Eventually they will be suppressed during codegen, but eliminating
+ // here enables more opportunity for GlobalDCE as it may make
+ // globals referenced by available external functions dead
+ // and saves running remaining passes on the eliminated functions.
+ MPM.add(createEliminateAvailableExternallyPass());
+
+ if (!DisableUnitAtATime)
+ MPM.add(createReversePostOrderFunctionAttrsPass());
+
+ // If we are planning to perform ThinLTO later, let's not bloat the code with
+ // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
+ // during ThinLTO and perform the rest of the optimizations afterward.
+ if (PrepareForThinLTO) {
+ // Reduce the size of the IR as much as possible.
+ MPM.add(createGlobalOptimizerPass());
+ // Rename anon globals to be able to export them in the summary.
+ MPM.add(createNameAnonGlobalPass());
+ return;
+ }
+
+ if (PerformThinLTO)
+ // Optimize globals now when performing ThinLTO, this enables more
+ // optimizations later.
+ MPM.add(createGlobalOptimizerPass());
+
+  // Schedule LoopVersioningLICM once inlining is over, because at that point
+  // we may see more accurate aliasing. The reason to run it this late is that
+  // versioning too early may prevent further inlining due to the increase in
+  // code size. By placing it just after inlining, the optimizations that run
+  // later can benefit from the no-alias assumption in the cloned loop.
+ if (UseLoopVersioningLICM) {
+ MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
+ MPM.add(createLICMPass()); // Hoist loop invariants
+ }
+
+ if (EnableNonLTOGlobalsModRef)
+ // We add a fresh GlobalsModRef run at this point. This is particularly
+ // useful as the above will have inlined, DCE'ed, and function-attr
+ // propagated everything. We should at this point have a reasonably minimal
+ // and richly annotated call graph. By computing aliasing and mod/ref
+ // information for all local globals here, the late loop passes and notably
+ // the vectorizer will be able to use them to help recognize vectorizable
+ // memory operations.
+ //
+ // Note that this relies on a bug in the pass manager which preserves
+ // a module analysis into a function pass pipeline (and throughout it) so
+ // long as the first function pass doesn't invalidate the module analysis.
+ // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
+ // this to work. Fortunately, it is trivial to preserve AliasAnalysis
+ // (doing nothing preserves it as it is required to be conservatively
+ // correct in the face of IR changes).
+ MPM.add(createGlobalsAAWrapperPass());
+
+ MPM.add(createFloat2IntPass());
+
+ addExtensionsToPM(EP_VectorizerStart, MPM);
+
+  // Re-rotate loops in all our loop nests. These may have fallen out of
+ // rotated form due to GVN or other transformations, and the vectorizer relies
+ // on the rotated form. Disable header duplication at -Oz.
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+
+ // Distribute loops to allow partial vectorization. I.e. isolate dependences
+ // into separate loop that would otherwise inhibit vectorization. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+ MPM.add(createLoopDistributePass());
+
+ MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ if (EnableLoopLoadElim)
+ MPM.add(createLoopLoadEliminationPass());
+
+ // FIXME: Because of #pragma vectorize enable, the passes below are always
+ // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+ // on -O1 and no #pragma is found). Would be good to have these two passes
+ // as function calls, so that we can only pass them when the vectorizer
+ // changed the code.
+ addInstructionCombiningPass(MPM);
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+ // runtime checks for two inner loops in the same outer loop, fold any
+ // common computations, hoist loop-invariant aspects out of any outer loop,
+ // and unswitch the runtime checks if possible. Once hoisted, we may have
+ // dead (or speculatable) control flows or more combining opportunities.
+ MPM.add(createEarlyCSEPass());
+ MPM.add(createCorrelatedValuePropagationPass());
+ addInstructionCombiningPass(MPM);
+ MPM.add(createLICMPass());
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ MPM.add(createCFGSimplificationPass());
+ addInstructionCombiningPass(MPM);
+ }
+
+ if (RunSLPAfterLoopVectorization && SLPVectorize) {
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ MPM.add(createEarlyCSEPass());
+ }
+ }
+
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables
+ addInstructionCombiningPass(MPM);
+
+ if (!DisableUnrollLoops) {
+ MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
+
+    // LoopUnroll may generate some redundancy to clean up.
+ addInstructionCombiningPass(MPM);
+
+    // Runtime unrolling will introduce a runtime check in the loop prologue.
+    // If the unrolled loop is an inner loop, then the prologue will be inside
+    // the outer loop. The LICM pass can help to promote the runtime check out
+    // if the checked value is loop invariant.
+ MPM.add(createLICMPass());
+ }
+
+ // After vectorization and unrolling, assume intrinsics may tell us more
+ // about pointer alignments.
+ MPM.add(createAlignmentFromAssumptionsPass());
+
+ if (!DisableUnitAtATime) {
+ // FIXME: We shouldn't bother with this anymore.
+ MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+
+ // GlobalOpt already deletes dead functions and globals, at -O2 try a
+ // late pass of GlobalDCE. It is capable of deleting dead cycles.
+ if (OptLevel > 1) {
+ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
+ MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
+ }
+
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+
+  // The LoopSink pass sinks instructions hoisted by LICM, which serves as a
+  // canonicalization pass that enables other optimizations. As a result, the
+  // LoopSink pass needs to run very late in the IR pipeline to avoid undoing
+  // LICM's results too early.
+ MPM.add(createLoopSinkPass());
+ // Get rid of LCSSA nodes.
+ MPM.add(createInstructionSimplifierPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ MPM.add(createCFGSimplificationPass());
+
+ addExtensionsToPM(EP_OptimizerLast, MPM);
+}
+
+void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ PM.add(createGlobalDCEPass());
+
+ // Provide AliasAnalysis services for optimizations.
+ addInitialAliasAnalysisPasses(PM);
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ PM.add(createForceFunctionAttrsLegacyPass());
+
+ // Infer attributes about declarations if possible.
+ PM.add(createInferFunctionAttrsLegacyPass());
+
+ if (OptLevel > 1) {
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+ // This two-step promotion is to save the compile time. For LTO, it should
+ // produce the same result as if we only do promotion here.
+ PM.add(
+ createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ PM.add(createIPSCCPPass());
+ }
+
+ // Infer attributes about definitions. The readnone attribute in particular is
+ // required for virtual constant propagation.
+ PM.add(createPostOrderFunctionAttrsLegacyPass());
+ PM.add(createReversePostOrderFunctionAttrsPass());
+
+ // Split globals using inrange annotations on GEP indices. This can help
+ // improve the quality of generated code when virtual constant propagation or
+ // control flow integrity are enabled.
+ PM.add(createGlobalSplitPass());
+
+ // Apply whole-program devirtualization and virtual constant propagation.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+
+ // That's all we need at opt level 1.
+ if (OptLevel == 1)
+ return;
+
+ // Now that we internalized some globals, see if we can hack on them!
+ PM.add(createGlobalOptimizerPass());
+ // Promote any localized global vars.
+ PM.add(createPromoteMemoryToRegisterPass());
+
+ // Linking modules together can lead to duplicated global constants, only
+ // keep one copy of each constant.
+ PM.add(createConstantMergePass());
+
+ // Remove unused arguments from functions.
+ PM.add(createDeadArgEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ addInstructionCombiningPass(PM);
+ addExtensionsToPM(EP_Peephole, PM);
+
+ // Inline small functions
+ bool RunInliner = Inliner;
+ if (RunInliner) {
+ PM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ PM.add(createPruneEHPass()); // Remove dead EH info.
+
+ // Optimize globals again if we ran the inliner.
+ if (RunInliner)
+ PM.add(createGlobalOptimizerPass());
+ PM.add(createGlobalDCEPass()); // Remove dead functions.
+
+ // If we didn't decide to inline a function, check to see if we can
+ // transform it to pass arguments by value instead of by reference.
+ PM.add(createArgumentPromotionPass());
+
+ // The IPO passes may leave cruft around. Clean up after them.
+ addInstructionCombiningPass(PM);
+ addExtensionsToPM(EP_Peephole, PM);
+ PM.add(createJumpThreadingPass());
+
+ // Break up allocas
+ PM.add(createSROAPass());
+
+ // Run a few AA driven optimizations here and now, to cleanup the code.
+ PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
+ PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
+
+ PM.add(createLICMPass()); // Hoist loop invariants.
+ PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
+ PM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
+ PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+
+ // Nuke dead stores.
+ PM.add(createDeadStoreEliminationPass());
+
+ // More loops are countable; try to optimize them.
+ PM.add(createIndVarSimplifyPass());
+ PM.add(createLoopDeletionPass());
+ if (EnableLoopInterchange)
+ PM.add(createLoopInterchangePass());
+
+ if (!DisableUnrollLoops)
+ PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
+ PM.add(createLoopVectorizePass(true, LoopVectorize));
+ // The vectorizer may have significantly shortened a loop body; unroll again.
+ if (!DisableUnrollLoops)
+ PM.add(createLoopUnrollPass(OptLevel));
+
+ // Now that we've optimized loops (in particular loop induction variables),
+ // we may have exposed more scalar opportunities. Run parts of the scalar
+ // optimizer again at this point.
+ addInstructionCombiningPass(PM); // Initial cleanup
+ PM.add(createCFGSimplificationPass()); // if-convert
+ PM.add(createSCCPPass()); // Propagate exposed constants
+ addInstructionCombiningPass(PM); // Clean up again
+ PM.add(createBitTrackingDCEPass());
+
+ // More scalar chains could be vectorized due to more alias information
+ if (RunSLPAfterLoopVectorization)
+ if (SLPVectorize)
+ PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+ // After vectorization, assume intrinsics may tell us more about pointer
+ // alignments.
+ PM.add(createAlignmentFromAssumptionsPass());
+
+ // Cleanup and simplify the code after the scalar optimizations.
+ addInstructionCombiningPass(PM);
+ addExtensionsToPM(EP_Peephole, PM);
+
+ PM.add(createJumpThreadingPass());
+}
+
+void PassManagerBuilder::addLateLTOOptimizationPasses(
+ legacy::PassManagerBase &PM) {
+ // Delete basic blocks, which optimization passes may have killed.
+ PM.add(createCFGSimplificationPass());
+
+ // Drop bodies of available externally objects to improve GlobalDCE.
+ PM.add(createEliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ PM.add(createGlobalDCEPass());
+
+ // FIXME: this is profitable (for compiler time) to do at -O0 too, but
+ // currently it damages debug info.
+ if (MergeFunctions)
+ PM.add(createMergeFunctionsPass());
+}
+
+void PassManagerBuilder::populateThinLTOPassManager(
+ legacy::PassManagerBase &PM) {
+ PerformThinLTO = true;
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ if (ImportSummary) {
+ // These passes import type identifier resolutions for whole-program
+ // devirtualization and CFI. They must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+ PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
+ populateModulePassManager(PM);
+
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+ PerformThinLTO = false;
+}
+
+void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ if (OptLevel != 0)
+ addLTOOptimizationPasses(PM);
+ else {
+ // The whole-program-devirt pass needs to run at -O0 because only it knows
+ // about the llvm.type.checked.load intrinsic: it needs to both lower the
+ // intrinsic itself and handle it in the summary.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+ }
+
+ // Create a function that performs CFI checks for cross-DSO calls with targets
+ // in the current module.
+ PM.add(createCrossDSOCFIPass());
+
+ // Lower type metadata and the type.test intrinsic. This pass supports Clang's
+ // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
+ // link time if CFI is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
+
+ if (OptLevel != 0)
+ addLateLTOOptimizationPasses(PM);
+
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+}
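+
+// Illustrative example (driver-side): a full LTO pipeline is built the same
+// way, but via populateLTOPassManager; "MergedModule" is an assumed,
+// already-linked llvm::Module.
+// \code
+//   PassManagerBuilder Builder;
+//   Builder.OptLevel = 2;
+//   Builder.Inliner = createFunctionInliningPass();
+//
+//   legacy::PassManager PM;
+//   Builder.populateLTOPassManager(PM);
+//   PM.run(MergedModule);
+// \endcode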
+
+inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) {
+ return reinterpret_cast<PassManagerBuilder*>(P);
+}
+
+inline LLVMPassManagerBuilderRef wrap(PassManagerBuilder *P) {
+ return reinterpret_cast<LLVMPassManagerBuilderRef>(P);
+}
+
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
+ PassManagerBuilder *PMB = new PassManagerBuilder();
+ return wrap(PMB);
+}
+
+void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ delete Builder;
+}
+
+void
+LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned OptLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->OptLevel = OptLevel;
+}
+
+void
+LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned SizeLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->SizeLevel = SizeLevel;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnitAtATime = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnrollLoops = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void
+LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
+ unsigned Threshold) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->Inliner = createFunctionInliningPass(Threshold);
+}
+
+void
+LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
+ Builder->populateFunctionPassManager(*FPM);
+}
+
+void
+LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *MPM = unwrap(PM);
+ Builder->populateModulePassManager(*MPM);
+}
+
+void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM,
+ LLVMBool Internalize,
+ LLVMBool RunInliner) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *LPM = unwrap(PM);
+
+  // A small backwards compatibility hack. populateLTOPassManager used to take
+  // a RunInliner option.
+ if (RunInliner && !Builder->Inliner)
+ Builder->Inliner = createFunctionInliningPass();
+
+ Builder->populateLTOPassManager(*LPM);
+}
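+
+// Illustrative example of driving the builder through the C bindings above.
+// LLVMCreatePassManager, LLVMRunPassManager and LLVMDisposePassManager come
+// from llvm-c/Core.h; "Mod" is an assumed, existing LLVMModuleRef.
+// \code
+//   LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
+//   LLVMPassManagerBuilderSetOptLevel(PMB, 2);
+//   LLVMPassManagerBuilderUseInlinerWithThreshold(PMB, 275);
+//
+//   LLVMPassManagerRef PM = LLVMCreatePassManager();
+//   LLVMPassManagerBuilderPopulateModulePassManager(PMB, PM);
+//   LLVMRunPassManager(PM, Mod);
+//
+//   LLVMDisposePassManager(PM);
+//   LLVMPassManagerBuilderDispose(PMB);
+// \endcode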
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
new file mode 100644
index 000000000000..3fd59847a005
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -0,0 +1,270 @@
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "prune-eh"
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+ struct PruneEH : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PruneEH() : CallGraphSCCPass(ID) {
+ initializePruneEHPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnSCC - Analyze the SCC, performing the transformation if possible.
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ };
+}
+static bool SimplifyFunction(Function *F, CallGraph &CG);
+static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG);
+
+char PruneEH::ID = 0;
+INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
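+
+// Illustrative example (caller-side): the pass can be scheduled on a legacy
+// pass manager; the CallGraph analysis it requires is scheduled implicitly.
+// "M" is an assumed, existing llvm::Module.
+// \code
+//   legacy::PassManager PM;
+//   PM.add(createPruneEHPass());
+//   PM.run(M);
+// \endcode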
+
+static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
+ SmallPtrSet<CallGraphNode *, 8> SCCNodes;
+ bool MadeChange = false;
+
+ // Fill SCCNodes with the elements of the SCC. Used for quickly
+ // looking up whether a given CallGraphNode is in this SCC.
+ for (CallGraphNode *I : SCC)
+ SCCNodes.insert(I);
+
+ // First pass, scan all of the functions in the SCC, simplifying them
+ // according to what we know.
+ for (CallGraphNode *I : SCC)
+ if (Function *F = I->getFunction())
+ MadeChange |= SimplifyFunction(F, CG);
+
+ // Next, check to see if any callees might throw or if there are any external
+ // functions in this SCC: if so, we cannot prune any functions in this SCC.
+ // Definitions that are weak and not declared non-throwing might be
+// overridden at link time with something that throws, so assume that.
+ // If this SCC includes the unwind instruction, we KNOW it throws, so
+ // obviously the SCC might throw.
+ //
+ bool SCCMightUnwind = false, SCCMightReturn = false;
+ for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end();
+ (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) {
+ Function *F = (*I)->getFunction();
+ if (!F) {
+ SCCMightUnwind = true;
+ SCCMightReturn = true;
+ } else if (!F->hasExactDefinition()) {
+ SCCMightUnwind |= !F->doesNotThrow();
+ SCCMightReturn |= !F->doesNotReturn();
+ } else {
+ bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+ bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+ // Determine if we should scan for InlineAsm in a naked function as it
+ // is the only way to return without a ReturnInst. Only do this for
+ // no-inline functions as functions which may be inlined cannot
+ // meaningfully return via assembly.
+ bool CheckReturnViaAsm = CheckReturn &&
+ F->hasFnAttribute(Attribute::Naked) &&
+ F->hasFnAttribute(Attribute::NoInline);
+
+ if (!CheckUnwind && !CheckReturn)
+ continue;
+
+ for (const BasicBlock &BB : *F) {
+ const TerminatorInst *TI = BB.getTerminator();
+ if (CheckUnwind && TI->mayThrow()) {
+ SCCMightUnwind = true;
+ } else if (CheckReturn && isa<ReturnInst>(TI)) {
+ SCCMightReturn = true;
+ }
+
+ for (const Instruction &I : BB) {
+ if ((!CheckUnwind || SCCMightUnwind) &&
+ (!CheckReturnViaAsm || SCCMightReturn))
+ break;
+
+ // Check to see if this function performs an unwind or calls an
+ // unwinding function.
+ if (CheckUnwind && !SCCMightUnwind && I.mayThrow()) {
+ bool InstMightUnwind = true;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ CallGraphNode *CalleeNode = CG[Callee];
+ // If the callee is outside our current SCC then we may throw
+ // because it might. If it is inside, do nothing.
+ if (SCCNodes.count(CalleeNode) > 0)
+ InstMightUnwind = false;
+ }
+ }
+ SCCMightUnwind |= InstMightUnwind;
+ }
+ if (CheckReturnViaAsm && !SCCMightReturn)
+ if (auto ICS = ImmutableCallSite(&I))
+ if (const auto *IA = dyn_cast<InlineAsm>(ICS.getCalledValue()))
+ if (IA->hasSideEffects())
+ SCCMightReturn = true;
+ }
+
+ if (SCCMightUnwind && SCCMightReturn)
+ break;
+ }
+ }
+ }
+
+ // If the SCC doesn't unwind or doesn't throw, note this fact.
+ if (!SCCMightUnwind || !SCCMightReturn)
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+
+ if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
+ F->addFnAttr(Attribute::NoUnwind);
+ MadeChange = true;
+ }
+
+ if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) {
+ F->addFnAttr(Attribute::NoReturn);
+ MadeChange = true;
+ }
+ }
+
+ for (CallGraphNode *I : SCC) {
+ // Convert any invoke instructions to non-throwing functions in this node
+ // into call instructions with a branch. This makes the exception blocks
+ // dead.
+ if (Function *F = I->getFunction())
+ MadeChange |= SimplifyFunction(F, CG);
+ }
+
+ return MadeChange;
+}
+
+
+bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ return runImpl(SCC, CG);
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
+static bool SimplifyFunction(Function *F, CallGraph &CG) {
+ bool MadeChange = false;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) {
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ removeUnwindEdge(&*BB);
+
+ // If the unwind block is now dead, nuke it.
+ if (pred_empty(UnwindBlock))
+ DeleteBasicBlock(UnwindBlock, CG); // Delete the new BB.
+
+ ++NumRemoved;
+ MadeChange = true;
+ }
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+ if (CallInst *CI = dyn_cast<CallInst>(I++))
+ if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) {
+ // This call calls a function that cannot return. Insert an
+ // unreachable instruction after it and simplify the code. Do this
+ // by splitting the BB, adding the unreachable, then deleting the
+ // new BB.
+ BasicBlock *New = BB->splitBasicBlock(I);
+
+ // Remove the uncond branch and add an unreachable.
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB->getContext(), &*BB);
+
+ DeleteBasicBlock(New, CG); // Delete the new BB.
+ MadeChange = true;
+ ++NumUnreach;
+ break;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
+static void DeleteBasicBlock(BasicBlock *BB, CallGraph &CG) {
+ assert(pred_empty(BB) && "BB is not dead!");
+
+ Instruction *TokenInst = nullptr;
+
+ CallGraphNode *CGN = CG[BB->getParent()];
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+ --I;
+
+ if (I->getType()->isTokenTy()) {
+ TokenInst = &*I;
+ break;
+ }
+
+    if (auto CS = CallSite(&*I)) {
+ const Function *Callee = CS.getCalledFunction();
+ if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
+ CGN->removeCallEdgeFor(CS);
+ else if (!Callee->isIntrinsic())
+ CGN->removeCallEdgeFor(CS);
+ }
+
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ if (TokenInst) {
+ if (!isa<TerminatorInst>(TokenInst))
+ changeToUnreachable(TokenInst->getNextNode(), /*UseLLVMTrap=*/false);
+ } else {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ Succs[i]->removePredecessor(BB);
+
+ BB->eraseFromParent();
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
new file mode 100644
index 000000000000..ac4765f96075
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -0,0 +1,1496 @@
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+// to indicate the weights of each edge coming out of the branch.
+// The weight of each edge is the weight of the target block for
+// that edge. The weight of a block B is computed as the maximum
+// number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/SampleProfile.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <cctype>
+
+using namespace llvm;
+using namespace sampleprof;
+
+#define DEBUG_TYPE "sample-profile"
+
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+ "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
+ "sample-profile-max-propagate-iterations", cl::init(100),
+ cl::desc("Maximum number of iterations to go through when propagating "
+ "sample block/edge weights through the CFG."));
+static cl::opt<unsigned> SampleProfileRecordCoverage(
+ "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of records in the input profile "
+ "are matched to the IR."));
+static cl::opt<unsigned> SampleProfileSampleCoverage(
+ "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of samples in the input profile "
+ "are matched to the IR."));
+static cl::opt<double> SampleProfileHotThreshold(
+ "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
+ cl::desc("Inlined functions that account for more than N% of all samples "
+             "collected in the parent function will be inlined again."));
+
+namespace {
+typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap;
+typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap;
+typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
+typedef DenseMap<Edge, uint64_t> EdgeWeightMap;
+typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>
+ BlockEdgeMap;
+
+class SampleCoverageTracker {
+public:
+ SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
+
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS) const;
+ unsigned countBodyRecords(const FunctionSamples *FS) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS) const;
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+
+private:
+ typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
+ typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
+ FunctionSamplesCoverageMap;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+  /// profiles (see the flags -sample-profile-check-record-coverage and
+  /// -sample-profile-check-sample-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples;
+};
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader {
+public:
+ SampleProfileLoader(StringRef Name = SampleProfileFile)
+ : DT(nullptr), PDT(nullptr), LI(nullptr), ACT(nullptr), Reader(),
+ Samples(nullptr), Filename(Name), ProfileIsValid(false),
+ TotalCollectedSamples(0) {}
+
+ bool doInitialization(Module &M);
+ bool runOnModule(Module &M);
+ void setACT(AssumptionCacheTracker *A) { ACT = A; }
+
+ void dump() { Reader->dump(); }
+
+protected:
+ bool runOnFunction(Function &F);
+ unsigned getFunctionLoc(Function &F);
+ bool emitAnnotations(Function &F);
+ ErrorOr<uint64_t> getInstWeight(const Instruction &I);
+ ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
+ const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
+ std::vector<const FunctionSamples *>
+ findIndirectCallFunctionSamples(const Instruction &I) const;
+ const FunctionSamples *findFunctionSamples(const Instruction &I) const;
+ bool inlineHotFunctions(Function &F,
+ DenseSet<GlobalValue::GUID> &ImportGUIDs);
+ void printEdgeWeight(raw_ostream &OS, Edge E);
+ void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
+ void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
+ bool computeBlockWeights(Function &F);
+ void findEquivalenceClasses(Function &F);
+ void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock> *DomTree);
+ void propagateWeights(Function &F);
+ uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
+ void buildEdges(Function &F);
+ bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
+ void computeDominanceAndLoopInfo(Function &F);
+ unsigned getOffset(const DILocation *DIL) const;
+ void clearFunctionData();
+
+ /// \brief Map basic blocks to their computed weights.
+ ///
+ /// The weight of a basic block is defined to be the maximum
+ /// of all the instruction weights in that block.
+ BlockWeightMap BlockWeights;
+
+ /// \brief Map edges to their computed weights.
+ ///
+ /// Edge weights are computed by propagating basic block weights in
+ /// SampleProfile::propagateWeights.
+ EdgeWeightMap EdgeWeights;
+
+ /// \brief Set of visited blocks during propagation.
+ SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
+
+ /// \brief Set of visited edges during propagation.
+ SmallSet<Edge, 32> VisitedEdges;
+
+ /// \brief Equivalence classes for block weights.
+ ///
+ /// Two blocks BB1 and BB2 are in the same equivalence class if they
+ /// dominate and post-dominate each other, and they are in the same loop
+ /// nest. When this happens, the two blocks are guaranteed to execute
+ /// the same number of times.
+ EquivalenceClassMap EquivalenceClass;
+
+  /// Map from function name to Function *. Used to find the function from
+  /// the function name. If the function name contains a suffix, an
+  /// additional entry mapping the stripped name to the function is added
+  /// when that mapping is one-to-one.
+ StringMap<Function *> SymbolMap;
+
+ /// \brief Dominance, post-dominance and loop information.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT;
+ std::unique_ptr<LoopInfo> LI;
+
+ AssumptionCacheTracker *ACT;
+
+ /// \brief Predecessors for each basic block in the CFG.
+ BlockEdgeMap Predecessors;
+
+ /// \brief Successors for each basic block in the CFG.
+ BlockEdgeMap Successors;
+
+ SampleCoverageTracker CoverageTracker;
+
+ /// \brief Profile reader object.
+ std::unique_ptr<SampleProfileReader> Reader;
+
+ /// \brief Samples collected for the body of this function.
+ FunctionSamples *Samples;
+
+ /// \brief Name of the profile file to load.
+ std::string Filename;
+
+ /// \brief Flag indicating whether the profile input loaded successfully.
+ bool ProfileIsValid;
+
+ /// \brief Total number of samples collected in this profile.
+ ///
+ /// This is the sum of all the samples collected in all the functions executed
+ /// at runtime.
+ uint64_t TotalCollectedSamples;
+};
+
+class SampleProfileLoaderLegacyPass : public ModulePass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
+ SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile)
+ : ModulePass(ID), SampleLoader(Name) {
+ initializeSampleProfileLoaderLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void dump() { SampleLoader.dump(); }
+
+ bool doInitialization(Module &M) override {
+ return SampleLoader.doInitialization(M);
+ }
+ StringRef getPassName() const override { return "Sample profile pass"; }
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+
+private:
+ SampleProfileLoader SampleLoader;
+};
+
+/// Return true if the given callsite is hot with respect to its caller.
+///
+/// Functions that were inlined in the original binary will be represented
+/// in the inline stack in the sample profile. If the profile shows that
+/// the original inline decision was "good" (i.e., the callsite is executed
+/// frequently), then we will recreate the inline decision and apply the
+/// profile from the inlined callsite.
+///
+/// To decide whether an inlined callsite is hot, we compute the fraction
+/// of samples used by the callsite with respect to the total number of samples
+/// collected in the caller.
+///
+/// If that fraction is larger than the default given by
+/// SampleProfileHotThreshold, the callsite will be inlined again.
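+///
+/// Illustrative example (hypothetical numbers): if the caller's profile
+/// recorded 10,000 total samples and the inlined callsite accounts for 800 of
+/// them, the callsite used 8% of the caller's samples and would be considered
+/// hot for any threshold of 8% or lower.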
+bool callsiteIsHot(const FunctionSamples *CallerFS,
+ const FunctionSamples *CallsiteFS) {
+ if (!CallsiteFS)
+ return false; // The callsite was not inlined in the original binary.
+
+ uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
+ if (ParentTotalSamples == 0)
+ return false; // Avoid division by zero.
+
+ uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
+ if (CallsiteTotalSamples == 0)
+ return false; // Callsite is trivially cold.
+
+ double PercentSamples =
+ (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
+ return PercentSamples >= SampleProfileHotThreshold;
+}
+}
+
+/// Mark as used the sample record for the given function samples at
+/// (LineOffset, Discriminator).
+///
+/// \returns true if this is the first time we mark the given record.
+bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
+ uint32_t LineOffset,
+ uint32_t Discriminator,
+ uint64_t Samples) {
+ LineLocation Loc(LineOffset, Discriminator);
+ unsigned &Count = SampleCoverage[FS][Loc];
+ bool FirstTime = (++Count == 1);
+ if (FirstTime)
+ TotalUsedSamples += Samples;
+ return FirstTime;
+}
+
+/// Return the number of sample records that were applied from this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
+ auto I = SampleCoverage.find(FS);
+
+ // The size of the coverage map for FS represents the number of records
+ // that were marked used at least once.
+ unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
+
+ // If there are inlined callsites in this function, count the samples found
+ // in the respective bodies. However, do not bother counting callees with 0
+  // total samples; these are callees that were never invoked at runtime.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countUsedRecords(CalleeSamples);
+ }
+
+ return Count;
+}
+
+/// Return the number of sample records in the body of this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
+ unsigned Count = FS->getBodySamples().size();
+
+ // Only count records in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countBodyRecords(CalleeSamples);
+ }
+
+ return Count;
+}
+
+/// Return the number of samples collected in the body of this profile.
+///
+/// This count does not include samples from cold inlined callsites.
+uint64_t
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
+ uint64_t Total = 0;
+ for (const auto &I : FS->getBodySamples())
+ Total += I.second.getSamples();
+
+ // Only count samples in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Total += countBodySamples(CalleeSamples);
+ }
+
+ return Total;
+}
+
+/// Return the fraction of sample records used in this profile.
+///
+/// The returned value is an unsigned integer in the range 0-100 indicating
+/// the percentage of sample records that were used while applying this
+/// profile to the associated function.
+unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
+ unsigned Total) const {
+ assert(Used <= Total &&
+ "number of used records cannot exceed the total number of records");
+ return Total > 0 ? Used * 100 / Total : 100;
+}
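+
+// Worked example for computeCoverage (illustrative numbers only): with 45
+// used records out of 60 total records, the returned coverage is
+// 45 * 100 / 60 = 75.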
+
+/// Clear all the per-function data used to load samples and propagate weights.
+void SampleProfileLoader::clearFunctionData() {
+ BlockWeights.clear();
+ EdgeWeights.clear();
+ VisitedBlocks.clear();
+ VisitedEdges.clear();
+ EquivalenceClass.clear();
+ DT = nullptr;
+ PDT = nullptr;
+ LI = nullptr;
+ Predecessors.clear();
+ Successors.clear();
+ CoverageTracker.clear();
+}
+
+/// Returns the line offset to the start line of the subprogram.
+/// We assume that a single function will not exceed 65535 LOC.
+unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
+ return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+ 0xffff;
+}
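+
+// Worked example for getOffset (hypothetical source lines): an instruction at
+// line 1107 inside a subprogram whose definition starts at line 1100 yields
+// offset 7, the line key used to look up samples in the function's profile.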
+
+/// \brief Print the weight of edge \p E on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param E Edge to print.
+void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
+ OS << "weight[" << E.first->getName() << "->" << E.second->getName()
+ << "]: " << EdgeWeights[E] << "\n";
+}
+
+/// \brief Print the equivalence class of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
+ const BasicBlock *BB) {
+ const BasicBlock *Equiv = EquivalenceClass[BB];
+ OS << "equivalence[" << BB->getName()
+ << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
+}
+
+/// \brief Print the weight of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
+ const BasicBlock *BB) const {
+ const auto &I = BlockWeights.find(BB);
+ uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
+ OS << "weight[" << BB->getName() << "]: " << W << "\n";
+}
+
+/// \brief Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// enclosing function (see getOffset). We then look up the samples collected
+/// for \p Inst in the function's body samples.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the weight of \p Inst.
+ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
+ const DebugLoc &DLoc = Inst.getDebugLoc();
+ if (!DLoc)
+ return std::error_code();
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (!FS)
+ return std::error_code();
+
+ // Ignore all intrinsics and branch instructions.
+  // Branch instructions usually carry debug info from sources outside of
+  // their enclosing basic block, so we ignore them during annotation.
+ if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst))
+ return std::error_code();
+
+  // If a call/invoke instruction was inlined in the profile but is not
+  // inlined here, its samples belong to the inlined body in the profile, so
+  // the call instruction itself should have a 0 count.
+ if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+ findCalleeFunctionSamples(Inst))
+ return 0;
+
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
+ if (R) {
+ bool FirstMark =
+ CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
+ if (FirstMark) {
+ const Function *F = Inst.getParent()->getParent();
+ LLVMContext &Ctx = F->getContext();
+ emitOptimizationRemark(
+ Ctx, DEBUG_TYPE, *F, DLoc,
+ Twine("Applied ") + Twine(*R) +
+ " samples from profile (offset: " + Twine(LineOffset) +
+ ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")");
+ }
+ DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
+ }
+ return R;
+}
+
+/// \brief Compute the weight of a basic block.
+///
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB.
+///
+/// \param BB The basic block to query.
+///
+/// \returns the weight for \p BB.
+ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ uint64_t Max = 0;
+ bool HasWeight = false;
+ for (auto &I : BB->getInstList()) {
+ const ErrorOr<uint64_t> &R = getInstWeight(I);
+ if (R) {
+ Max = std::max(Max, R.get());
+ HasWeight = true;
+ }
+ }
+ return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
+}
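+
+// Illustrative example (made-up sample counts): a block whose instructions
+// carry weights 10, 250 and 3 gets block weight 250; a block in which no
+// instruction has a sample record yields std::error_code().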
+
+/// \brief Compute and store the weights of every basic block.
+///
+/// This populates the BlockWeights map by computing
+/// the weights of every basic block in the CFG.
+///
+/// \param F The function to query.
+bool SampleProfileLoader::computeBlockWeights(Function &F) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Block weights\n");
+ for (const auto &BB : F) {
+ ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
+ if (Weight) {
+ BlockWeights[&BB] = Weight.get();
+ VisitedBlocks.insert(&BB);
+ Changed = true;
+ }
+ DEBUG(printBlockWeight(dbgs(), &BB));
+ }
+
+ return Changed;
+}
+
+/// \brief Get the FunctionSamples for a call instruction.
+///
+/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
+/// instance that the call instruction targets. It contains all samples that
+/// reside in that inlined instance. We first find the inlined instance that
+/// the call instruction comes from, then traverse its children to find the
+/// callsite with the matching location.
+///
+/// \param Inst Call/Invoke instruction to query.
+///
+/// \returns The FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL) {
+ return nullptr;
+ }
+
+ StringRef CalleeName;
+ if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
+ if (Function *Callee = CI->getCalledFunction())
+ CalleeName = Callee->getName();
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return nullptr;
+
+ return FS->findFunctionSamplesAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
+}
+
+/// Returns a vector of FunctionSamples that are the indirect call targets
+/// of \p Inst. The vector is sorted in descending order of total sample count.
+std::vector<const FunctionSamples *>
+SampleProfileLoader::findIndirectCallFunctionSamples(
+ const Instruction &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ std::vector<const FunctionSamples *> R;
+
+ if (!DIL) {
+ return R;
+ }
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return R;
+
+ if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
+ if (M->size() == 0)
+ return R;
+ for (const auto &NameFS : *M) {
+ R.push_back(&NameFS.second);
+ }
+ std::sort(R.begin(), R.end(),
+ [](const FunctionSamples *L, const FunctionSamples *R) {
+ return L->getTotalSamples() > R->getTotalSamples();
+ });
+ }
+ return R;
+}
+
+/// \brief Get the FunctionSamples for an instruction.
+///
+/// The FunctionSamples of an instruction \p Inst is the inlined instance
+/// that the instruction comes from. We traverse the inline stack of that
+/// instruction and match it against the tree nodes in the profile.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
+ SmallVector<std::pair<LineLocation, StringRef>, 10> S;
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL)
+ return Samples;
+
+ const DILocation *PrevDIL = DIL;
+ for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
+ S.push_back(std::make_pair(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
+ PrevDIL->getScope()->getSubprogram()->getLinkageName()));
+ PrevDIL = DIL;
+ }
+ if (S.size() == 0)
+ return Samples;
+ const FunctionSamples *FS = Samples;
+ for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
+ FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
+ }
+ return FS;
+}
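+
+// Illustrative example (hypothetical call chain): for an instruction that was
+// inlined from bar() into the current function foo(), the inline stack holds
+// one entry pairing the callsite location in foo() with bar's linkage name;
+// the lookup starts at foo's profile (Samples) and descends into the callsite
+// subtree that holds bar's samples.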
+
+/// \brief Iteratively inline hot callsites of a function.
+///
+/// Iteratively traverse all callsites of the function \p F and check whether
+/// the corresponding inlined instance exists and is hot in the profile. If it
+/// is hot enough, inline the callsite and add the callee's new callsites to
+/// the caller. If the call is an indirect call, first promote it to a direct
+/// call. Each indirect call is limited to a single promoted target.
+///
+/// \param F function to perform iterative inlining.
+/// \param ImportGUIDs a set to be updated to include all GUIDs that come
+/// from a different module but inlined in the profiled binary.
+///
+/// \returns True if any inlining happened.
+bool SampleProfileLoader::inlineHotFunctions(
+ Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
+ bool Changed = false;
+ LLVMContext &Ctx = F.getContext();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
+ Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); };
+ while (true) {
+ bool LocalChanged = false;
+ SmallVector<Instruction *, 10> CIS;
+ for (auto &BB : F) {
+ bool Hot = false;
+ SmallVector<Instruction *, 10> Candidates;
+ for (auto &I : BB.getInstList()) {
+ const FunctionSamples *FS = nullptr;
+ if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
+ !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
+ Candidates.push_back(&I);
+ if (callsiteIsHot(Samples, FS))
+ Hot = true;
+ }
+ }
+ if (Hot) {
+ CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
+ }
+ }
+ for (auto I : CIS) {
+ InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
+ Function *CalledFunction = CallSite(I).getCalledFunction();
+ // Do not inline recursive calls.
+ if (CalledFunction == &F)
+ continue;
+ Instruction *DI = I;
+ if (!CalledFunction && !PromotedInsns.count(I) &&
+ CallSite(I).isIndirectCall())
+ for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
+ auto CalleeFunctionName = FS->getName();
+          // If it is a recursive call, we do not inline it as it could bloat
+          // the code exponentially. There is a way to handle this better,
+          // e.g. clone the caller first, and inline the cloned caller if it
+          // is recursive. As LLVM does not inline recursive calls, we simply
+          // ignore it instead of handling it explicitly.
+ if (CalleeFunctionName == F.getName())
+ continue;
+ const char *Reason = "Callee function not available";
+ auto R = SymbolMap.find(CalleeFunctionName);
+ if (R == SymbolMap.end())
+ continue;
+ CalledFunction = R->getValue();
+ if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
+              // The indirect target was promoted and inlined in the profile;
+              // as a result, we do not have profile info for the branch
+              // probability. We set the probability to 80% taken to indicate
+              // that the promoted direct call is likely taken.
+ DI = dyn_cast<Instruction>(
+ promoteIndirectCall(I, CalledFunction, 80, 100, false)
+ ->stripPointerCasts());
+ PromotedInsns.insert(I);
+ } else {
+ DEBUG(dbgs() << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason
+ << "\n");
+ continue;
+ }
+ }
+ if (!CalledFunction || !CalledFunction->getSubprogram()) {
+ findCalleeFunctionSamples(*I)->findImportedFunctions(
+ ImportGUIDs, F.getParent(),
+ Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
+ continue;
+ }
+ DebugLoc DLoc = I->getDebugLoc();
+ if (InlineFunction(CallSite(DI), IFI)) {
+ LocalChanged = true;
+ emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
+ Twine("inlined hot callee '") +
+ CalledFunction->getName() + "' into '" +
+ F.getName() + "'");
+ }
+ }
+ if (LocalChanged) {
+ Changed = true;
+ } else {
+ break;
+ }
+ }
+ return Changed;
+}
+
+/// \brief Find equivalence classes for the given block.
+///
+/// This finds all the blocks that are guaranteed to execute the same
+/// number of times as \p BB1. To do this, it traverses all the
+/// descendants of \p BB1 in the dominator or post-dominator tree.
+///
+/// A block BB2 will be in the same equivalence class as \p BB1 if
+/// the following holds:
+///
+/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
+/// is a descendant of \p BB1 in the dominator tree, then BB2 should
+/// dominate BB1 in the post-dominator tree.
+///
+/// 2- Both BB2 and \p BB1 must be in the same loop.
+///
+/// For every block BB2 that meets those two requirements, we set BB2's
+/// equivalence class to \p BB1.
+///
+/// \param BB1 Block to check.
+/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
+/// \param DomTree Opposite dominator tree. If \p Descendants is filled
+/// with blocks from \p BB1's dominator tree, then
+/// this is the post-dominator tree, and vice versa.
+void SampleProfileLoader::findEquivalencesFor(
+ BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock> *DomTree) {
+ const BasicBlock *EC = EquivalenceClass[BB1];
+ uint64_t Weight = BlockWeights[EC];
+ for (const auto *BB2 : Descendants) {
+ bool IsDomParent = DomTree->dominates(BB2, BB1);
+ bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
+ if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
+ EquivalenceClass[BB2] = EC;
+ // If BB2 is visited, then the entire EC should be marked as visited.
+ if (VisitedBlocks.count(BB2)) {
+ VisitedBlocks.insert(EC);
+ }
+
+ // If BB2 is heavier than BB1, make BB2 have the same weight
+ // as BB1.
+ //
+ // Note that we don't worry about the opposite situation here
+ // (when BB2 is lighter than BB1). We will deal with this
+ // during the propagation phase. Right now, we just want to
+ // make sure that BB1 has the largest weight of all the
+ // members of its equivalence set.
+ Weight = std::max(Weight, BlockWeights[BB2]);
+ }
+ }
+ if (EC == &EC->getParent()->getEntryBlock()) {
+ BlockWeights[EC] = Samples->getHeadSamples() + 1;
+ } else {
+ BlockWeights[EC] = Weight;
+ }
+}
+
+/// \brief Find equivalence classes.
+///
+/// Since samples may be missing from blocks, we can fill in the gaps by setting
+/// the weights of all the blocks in the same equivalence class to the same
+/// weight. To compute the concept of equivalence, we use dominance and loop
+/// information. Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// \param F The function to query.
+void SampleProfileLoader::findEquivalenceClasses(Function &F) {
+ SmallVector<BasicBlock *, 8> DominatedBBs;
+ DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ // Find equivalence sets based on dominance and post-dominance information.
+ for (auto &BB : F) {
+ BasicBlock *BB1 = &BB;
+
+ // Compute BB1's equivalence class once.
+ if (EquivalenceClass.count(BB1)) {
+ DEBUG(printBlockEquivalence(dbgs(), BB1));
+ continue;
+ }
+
+ // By default, blocks are in their own equivalence class.
+ EquivalenceClass[BB1] = BB1;
+
+ // Traverse all the blocks dominated by BB1. We are looking for
+ // every basic block BB2 such that:
+ //
+ // 1- BB1 dominates BB2.
+ // 2- BB2 post-dominates BB1.
+ // 3- BB1 and BB2 are in the same loop nest.
+ //
+ // If all those conditions hold, it means that BB2 is executed
+ // as many times as BB1, so they are placed in the same equivalence
+ // class by making BB2's equivalence class be BB1.
+ DominatedBBs.clear();
+ DT->getDescendants(BB1, DominatedBBs);
+ findEquivalencesFor(BB1, DominatedBBs, PDT.get());
+
+ DEBUG(printBlockEquivalence(dbgs(), BB1));
+ }
+
+ // Assign weights to equivalence classes.
+ //
+ // All the basic blocks in the same equivalence class will execute
+ // the same number of times. Since we know that the head block in
+ // each equivalence class has the largest weight, assign that weight
+ // to all the blocks in that equivalence class.
+ DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ for (auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EquivBB = EquivalenceClass[BB];
+ if (BB != EquivBB)
+ BlockWeights[BB] = BlockWeights[EquivBB];
+ DEBUG(printBlockWeight(dbgs(), BB));
+ }
+}
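+
+// Illustrative example (simple diamond CFG): in "entry -> {then, else} ->
+// join", entry dominates join, join post-dominates entry, and neither is
+// inside a loop, so join is placed in entry's equivalence class and receives
+// entry's weight; then and else each remain in their own class.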
+
+/// \brief Visit the given edge to decide if it has a valid weight.
+///
+/// If \p E has not been visited before, we copy it to \p UnknownEdge
+/// and increment the count of unknown edges.
+///
+/// \param E Edge to visit.
+/// \param NumUnknownEdges Current number of unknown edges.
+/// \param UnknownEdge Set if E has not been visited before.
+///
+/// \returns E's weight, if known. Otherwise, return 0.
+uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
+ Edge *UnknownEdge) {
+ if (!VisitedEdges.count(E)) {
+ (*NumUnknownEdges)++;
+ *UnknownEdge = E;
+ return 0;
+ }
+
+ return EdgeWeights[E];
+}
+
+/// \brief Propagate weights through incoming/outgoing edges.
+///
+/// If the weight of a basic block is known, and there is only one edge
+/// with an unknown weight, we can calculate the weight of that edge.
+///
+/// Similarly, if all the edges have a known count, we can calculate the
+/// count of the basic block, if needed.
+///
+/// \param F Function to process.
+/// \param UpdateBlockCount Whether we should update basic block counts that
+/// have already been annotated.
+///
+/// \returns True if new weights were assigned to edges or blocks.
+bool SampleProfileLoader::propagateThroughEdges(Function &F,
+ bool UpdateBlockCount) {
+ bool Changed = false;
+ DEBUG(dbgs() << "\nPropagation through edges\n");
+ for (const auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EC = EquivalenceClass[BB];
+
+ // Visit all the predecessor and successor edges to determine
+ // which ones have a weight assigned already. Note that it doesn't
+ // matter that we only keep track of a single unknown edge. The
+ // only case we are interested in handling is when only a single
+ // edge is unknown (see setEdgeOrBlockWeight).
+ for (unsigned i = 0; i < 2; i++) {
+ uint64_t TotalWeight = 0;
+ unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
+ Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
+
+ if (i == 0) {
+ // First, visit all predecessor edges.
+ NumTotalEdges = Predecessors[BB].size();
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ if (E.first == E.second)
+ SelfReferentialEdge = E;
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(Predecessors[BB][0], BB);
+ }
+ } else {
+ // On the second round, visit all successor edges.
+ NumTotalEdges = Successors[BB].size();
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(BB, Successors[BB][0]);
+ }
+ }
+
+ // After visiting all the edges, there are three cases that we
+ // can handle immediately:
+ //
+ // - All the edge weights are known (i.e., NumUnknownEdges == 0).
+ // In this case, we simply check that the sum of all the edges
+ // is the same as BB's weight. If not, we change BB's weight
+ // to match. Additionally, if BB had not been visited before,
+ // we mark it visited.
+ //
+ // - Only one edge is unknown and BB has already been visited.
+      //     In this case, we can compute the weight of the edge by
+      //     subtracting the sum of the known edge weights from the block's
+      //     weight. If the known edges weigh more than BB, the weight of the
+      //     last remaining edge is set to zero.
+ //
+      //   - There exists a self-referential edge and the weight of BB is
+      //     known. In this case, the weight of that edge can be derived from
+      //     BB's weight. We add up all the other known edges and set the
+      //     weight of the self-referential edge as in the previous case.
+ //
+ // In any other case, we must continue iterating. Eventually,
+ // all edges will get a weight, or iteration will stop when
+ // it reaches SampleProfileMaxPropagateIterations.
+ if (NumUnknownEdges <= 1) {
+ uint64_t &BBWeight = BlockWeights[EC];
+ if (NumUnknownEdges == 0) {
+ if (!VisitedBlocks.count(EC)) {
+ // If we already know the weight of all edges, the weight of the
+ // basic block can be computed. It should be no larger than the sum
+ // of all edge weights.
+ if (TotalWeight > BBWeight) {
+ BBWeight = TotalWeight;
+ Changed = true;
+ DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
+ }
+ } else if (NumTotalEdges == 1 &&
+ EdgeWeights[SingleEdge] < BlockWeights[EC]) {
+ // If there is only one edge for the visited basic block, use the
+ // block weight to adjust edge weight if edge weight is smaller.
+ EdgeWeights[SingleEdge] = BlockWeights[EC];
+ Changed = true;
+ }
+ } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
+ // If there is a single unknown edge and the block has been
+ // visited, then we can compute E's weight.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[UnknownEdge] = 0;
+ const BasicBlock *OtherEC;
+ if (i == 0)
+ OtherEC = EquivalenceClass[UnknownEdge.first];
+ else
+ OtherEC = EquivalenceClass[UnknownEdge.second];
+          // An edge's weight should never exceed the weights of the blocks
+          // it connects.
+ if (VisitedBlocks.count(OtherEC) &&
+ EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
+ EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
+ VisitedEdges.insert(UnknownEdge);
+ Changed = true;
+ DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
+ }
+ } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
+        // If a block weighs 0, all its in/out edges should weigh 0.
+ if (i == 0) {
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ } else {
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ }
+ } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
+ uint64_t &BBWeight = BlockWeights[BB];
+ // We have a self-referential edge and the weight of BB is known.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[SelfReferentialEdge] = 0;
+ VisitedEdges.insert(SelfReferentialEdge);
+ Changed = true;
+ DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
+ }
+ if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
+ BlockWeights[EC] = TotalWeight;
+ VisitedBlocks.insert(EC);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
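+
+// Worked example (made-up weights): if a block's equivalence class has weight
+// 100 and the block has two incoming edges of which the only known one
+// carries weight 60, the single unknown edge is assigned 100 - 60 = 40; if
+// the known edges already sum to more than 100, the unknown edge is set to 0.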
+
+/// \brief Build in/out edge lists for each basic block in the CFG.
+///
+/// We are interested in unique edges. If a block B1 has multiple
+/// edges to another block B2, we only add a single B1->B2 edge.
+void SampleProfileLoader::buildEdges(Function &F) {
+ for (auto &BI : F) {
+ BasicBlock *B1 = &BI;
+
+ // Add predecessors for B1.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ if (!Predecessors[B1].empty())
+ llvm_unreachable("Found a stale predecessors list in a basic block.");
+ for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
+ BasicBlock *B2 = *PI;
+ if (Visited.insert(B2).second)
+ Predecessors[B1].push_back(B2);
+ }
+
+ // Add successors for B1.
+ Visited.clear();
+ if (!Successors[B1].empty())
+ llvm_unreachable("Found a stale successors list in a basic block.");
+ for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
+ BasicBlock *B2 = *SI;
+ if (Visited.insert(B2).second)
+ Successors[B1].push_back(B2);
+ }
+ }
+}
+
+/// Sorts the CallTargetMap \p M by count in descending order and stores the
+/// sorted result in \p Sorted. Returns the total counts.
+static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
+ const SampleRecord::CallTargetMap &M) {
+ Sorted.clear();
+ uint64_t Sum = 0;
+ for (auto I = M.begin(); I != M.end(); ++I) {
+ Sum += I->getValue();
+ Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
+ }
+ std::sort(Sorted.begin(), Sorted.end(),
+ [](const InstrProfValueData &L, const InstrProfValueData &R) {
+ if (L.Count == R.Count)
+ return L.Value > R.Value;
+ else
+ return L.Count > R.Count;
+ });
+ return Sum;
+}
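+
+// Illustrative example (made-up targets): a call target map {foo: 700,
+// bar: 300} is flattened to [(GUID(foo), 700), (GUID(bar), 300)], sorted by
+// descending count, and the returned sum of 1000 becomes the total used when
+// annotating the indirect-call value profile.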
+
+/// \brief Propagate weights into edges
+///
+/// The following rules are applied to every block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all incoming or outgoing edges are known except one, and the
+/// weight of the block is already known, the weight of the unknown
+/// edge will be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+void SampleProfileLoader::propagateWeights(Function &F) {
+ bool Changed = true;
+ unsigned I = 0;
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ Loop *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlock *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
+ }
+
+ // Before propagation starts, build, for each block, a list of
+ // unique predecessors and successors. This is necessary to handle
+ // identical edges in multiway branches. Since we visit all blocks and all
+ // edges of the CFG, it is cleaner to build these lists once at the start
+ // of the pass.
+ buildEdges(F);
+
+ // Propagate until we converge or we go past the iteration limit.
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+ // The first propagation propagates BB counts from annotated BBs to unknown
+  // BBs. The second propagation pass resets edge weights and uses all BB
+  // weights to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+  // The third propagation pass allows adjusting annotated BB weights that
+  // are obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
+ }
+
+ // Generate MD_prof metadata for every branch instruction using the
+ // edge weights computed during propagation.
+ DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder MDB(Ctx);
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+
+ if (BlockWeights[BB]) {
+ for (auto &I : BB->getInstList()) {
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ continue;
+ CallSite CS(&I);
+ if (!CS.getCalledFunction()) {
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+
+ const FunctionSamples *FS = findFunctionSamples(I);
+ if (!FS)
+ continue;
+ auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
+ if (!T || T.get().size() == 0)
+ continue;
+ SmallVector<InstrProfValueData, 2> SortedCallTargets;
+ uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+ annotateValueSite(*I.getParent()->getParent()->getParent(), I,
+ SortedCallTargets, Sum, IPVK_IndirectCallTarget,
+ SortedCallTargets.size());
+ } else if (!dyn_cast<IntrinsicInst>(&I)) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(BlockWeights[BB]);
+ I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ }
+ }
+ }
+ TerminatorInst *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 1)
+ continue;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ continue;
+
+ DebugLoc BranchLoc = TI->getDebugLoc();
+ DEBUG(dbgs() << "\nGetting weights for branch at line "
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
+ SmallVector<uint32_t, 4> Weights;
+ uint32_t MaxWeight = 0;
+ DebugLoc MaxDestLoc;
+ for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ Edge E = std::make_pair(BB, Succ);
+ uint64_t Weight = EdgeWeights[E];
+ DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
+ // Use uint32_t saturated arithmetic to adjust the incoming weights,
+ // if needed. Sample counts in profiles are 64-bit unsigned values,
+ // but internally branch weights are expressed as 32-bit values.
+ if (Weight > std::numeric_limits<uint32_t>::max()) {
+ DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+ Weight = std::numeric_limits<uint32_t>::max();
+ }
+      // Each weight is incremented by one to avoid propagation errors
+      // introduced by 0 weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ if (Weight != 0) {
+ if (Weight > MaxWeight) {
+ MaxWeight = Weight;
+ MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc();
+ }
+ }
+ }
+
+ uint64_t TempWeight;
+ // Only set weights if there is at least one non-zero weight.
+ // In any other case, let the analyzer set weights.
+    // Do not set weights if weights are already present. In ThinLTO, the
+    // profile annotation is done twice. If the first annotation already set
+    // the weights, the second pass does not need to set them again.
+ if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
+ DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+ TI->setMetadata(llvm::LLVMContext::MD_prof,
+ MDB.createBranchWeights(Weights));
+ emitOptimizationRemark(
+ Ctx, DEBUG_TYPE, F, MaxDestLoc,
+ Twine("most popular destination for conditional branches at ") +
+ ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" +
+ Twine(BranchLoc.getLine()) + ":" +
+ Twine(BranchLoc.getCol()))
+ : Twine("<UNKNOWN LOCATION>")));
+ } else {
+ DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ }
+ }
+}
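+
+// Illustrative result (hypothetical weights): a conditional branch whose two
+// successor edges ended up with weights 60 and 40 is annotated above with
+// !{!"branch_weights", i32 61, i32 41}; each weight is incremented by one to
+// avoid zero weights in the metadata.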
+
+/// \brief Get the line number for the function header.
+///
+/// This looks up function \p F in the current compilation unit and
+/// retrieves the line number where the function is defined. This is
+/// line 0 for all the samples read from the profile file. Every line
+/// number is relative to this line.
+///
+/// \param F Function object to query.
+///
+/// \returns the line number where \p F is defined. If it returns 0,
+/// it means that there is no debug information available for \p F.
+unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
+ if (DISubprogram *S = F.getSubprogram())
+ return S->getLine();
+
+ // If the start of \p F is missing, emit a diagnostic to inform the user
+ // about the missed opportunity.
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ "No debug information found in function " + F.getName() +
+ ": Function profile not used",
+ DS_Warning));
+ return 0;
+}
+
+void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
+ DT.reset(new DominatorTree);
+ DT->recalculate(F);
+
+ PDT.reset(new DominatorTreeBase<BasicBlock>(true));
+ PDT->recalculate(F);
+
+ LI.reset(new LoopInfo);
+ LI->analyze(*DT);
+}
+
+/// \brief Generate branch weight metadata for all branches in \p F.
+///
+/// Branch weights are computed out of instruction samples using a
+/// propagation heuristic. Propagation proceeds in 3 phases:
+///
+/// 1- Assignment of block weights. All the basic blocks in the function
+///    are initially assigned the same weight as their most frequently
+/// executed instruction.
+///
+/// 2- Creation of equivalence classes. Since samples may be missing from
+/// blocks, we can fill in the gaps by setting the weights of all the
+/// blocks in the same equivalence class to the same weight. To compute
+/// the concept of equivalence, we use dominance and loop information.
+/// Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// 3- Propagation of block weights into edges. This uses a simple
+/// propagation heuristic. The following rules are applied to every
+/// block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all the edges are known except one, and the weight of the
+/// block is already known, the weight of the unknown edge will
+/// be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+///
+/// Since this propagation is not guaranteed to converge for every CFG, we
+/// only allow it to proceed for a limited number of iterations (controlled
+/// by -sample-profile-max-propagate-iterations).
+///
+/// FIXME: Try to replace this propagation heuristic with a scheme
+/// that is guaranteed to converge. A work-list approach similar to
+/// the standard value propagation algorithm used by SSA-CCP might
+/// work here.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on BB using the computed values for each of its branches.
+///
+/// \param F The function to query.
+///
+/// \returns true if \p F was modified. Returns false, otherwise.
+bool SampleProfileLoader::emitAnnotations(Function &F) {
+ bool Changed = false;
+
+ if (getFunctionLoc(F) == 0)
+ return false;
+
+ DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
+ << ": " << getFunctionLoc(F) << "\n");
+
+ DenseSet<GlobalValue::GUID> ImportGUIDs;
+ Changed |= inlineHotFunctions(F, ImportGUIDs);
+
+ // Compute basic block weights.
+ Changed |= computeBlockWeights(F);
+
+ if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+    // function entry. Also sets the GUIDs that come from a different
+    // module but were inlined in the profiled binary. This aims at making
+ // the IR match the profiled binary before annotation.
+ F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
+
+ // Compute dominance and loop info needed for propagation.
+ computeDominanceAndLoopInfo(F);
+
+ // Find equivalence classes.
+ findEquivalenceClasses(F);
+
+ // Propagate weights to all edges.
+ propagateWeights(F);
+ }
+
+ // If coverage checking was requested, compute it now.
+ if (SampleProfileRecordCoverage) {
+ unsigned Used = CoverageTracker.countUsedRecords(Samples);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileRecordCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile records (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+
+ if (SampleProfileSampleCoverage) {
+ uint64_t Used = CoverageTracker.getTotalUsedSamples();
+ uint64_t Total = CoverageTracker.countBodySamples(Samples);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileSampleCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+ return Changed;
+}
+
+char SampleProfileLoaderLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+
+bool SampleProfileLoader::doInitialization(Module &M) {
+ auto &Ctx = M.getContext();
+ auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ return false;
+ }
+ Reader = std::move(ReaderOrErr.get());
+ ProfileIsValid = (Reader->read() == sampleprof_error::success);
+ return true;
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass() {
+ return new SampleProfileLoaderLegacyPass(SampleProfileFile);
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+ return new SampleProfileLoaderLegacyPass(Name);
+}
+
+bool SampleProfileLoader::runOnModule(Module &M) {
+ if (!ProfileIsValid)
+ return false;
+
+ // Compute the total number of samples collected in this profile.
+ for (const auto &I : Reader->getProfiles())
+ TotalCollectedSamples += I.second.getTotalSamples();
+
+ // Populate the symbol map.
+ for (const auto &N_F : M.getValueSymbolTable()) {
+ std::string OrigName = N_F.getKey();
+ Function *F = dyn_cast<Function>(N_F.getValue());
+ if (F == nullptr)
+ continue;
+ SymbolMap[OrigName] = F;
+ auto pos = OrigName.find('.');
+ if (pos != std::string::npos) {
+ std::string NewName = OrigName.substr(0, pos);
+ auto r = SymbolMap.insert(std::make_pair(NewName, F));
+      // Failing to insert means there is already an entry in SymbolMap,
+      // i.e., multiple functions are mapped to the same stripped name. In
+      // this case of a name conflict, set the value to nullptr to avoid
+      // confusion.
+ if (!r.second)
+ r.first->second = nullptr;
+ }
+ }
+
+ bool retval = false;
+ for (auto &F : M)
+ if (!F.isDeclaration()) {
+ clearFunctionData();
+ retval |= runOnFunction(F);
+ }
+ if (M.getProfileSummary() == nullptr)
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
+ return retval;
+}
+
+bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
+ // FIXME: pass in AssumptionCache correctly for the new pass manager.
+ SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>());
+ return SampleLoader.runOnModule(M);
+}
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
+ F.setEntryCount(0);
+ Samples = Reader->getSamplesFor(F);
+ if (Samples && !Samples->empty())
+ return emitAnnotations(F);
+ return false;
+}
+
+PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ SampleProfileLoader SampleLoader(
+ ProfileFileName.empty() ? SampleProfileFile : ProfileFileName);
+
+ SampleLoader.doInitialization(M);
+
+ if (!SampleLoader.runOnModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
new file mode 100644
index 000000000000..3c3c5dd19d1f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -0,0 +1,88 @@
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for
+// dead declarations and removing them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+static bool stripDeadPrototypes(Module &M) {
+ bool MadeChange = false;
+
+ // Erase dead function prototypes.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function *F = &*I++;
+ // Function must be a prototype and unused.
+ if (F->isDeclaration() && F->use_empty()) {
+ F->eraseFromParent();
+ ++NumDeadPrototypes;
+ MadeChange = true;
+ }
+ }
+
+ // Erase dead global var prototypes.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ) {
+ GlobalVariable *GV = &*I++;
+ // Global must be a prototype and unused.
+ if (GV->isDeclaration() && GV->use_empty())
+ GV->eraseFromParent();
+ }
+
+ // Return an indication of whether we changed anything or not.
+ return MadeChange;
+}
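+
+// Illustrative example (hypothetical IR): a declaration such as
+//   declare i32 @unused_helper(i32)
+// that has no uses in the module is erased by stripDeadPrototypes, while a
+// declaration that still has call sites is left untouched.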
+
+PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (stripDeadPrototypes(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class StripDeadPrototypesLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ StripDeadPrototypesLegacyPass() : ModulePass(ID) {
+ initializeStripDeadPrototypesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return stripDeadPrototypes(M);
+ }
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesLegacyPass::ID = 0;
+INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes",
+ "Strip Unused Function Prototypes", false, false)
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+ return new StripDeadPrototypesLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
new file mode 100644
index 000000000000..de1b51e206ff
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -0,0 +1,380 @@
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+//
+// * names for virtual registers
+// * symbols for internal globals and functions
+// * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+namespace {
+ class StripSymbols : public ModulePass {
+ bool OnlyDebugInfo;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripSymbols(bool ODI = false)
+ : ModulePass(ID), OnlyDebugInfo(ODI) {
+ initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripNonDebugSymbols : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripNonDebugSymbols()
+ : ModulePass(ID) {
+ initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDebugDeclare : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDebugDeclare()
+ : ModulePass(ID) {
+ initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDeadDebugInfo : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDeadDebugInfo()
+ : ModulePass(ID) {
+ initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char StripSymbols::ID = 0;
+INITIALIZE_PASS(StripSymbols, "strip",
+ "Strip all symbols from a module", false, false)
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+ return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
+ "Strip all symbols, except dbg symbols, from a module",
+ false, false)
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+ return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
+ "Strip all llvm.dbg.declare intrinsics", false, false)
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+ return new StripDebugDeclare();
+}
+
+char StripDeadDebugInfo::ID = 0;
+INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
+ "Strip debug info for unused symbols", false, false)
+
+ModulePass *llvm::createStripDeadDebugInfoPass() {
+ return new StripDeadDebugInfo();
+}
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+ for (User *U : V->users())
+ if (U != Usr)
+ return false;
+
+ return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+ assert(C->use_empty() && "Constant is not dead!");
+ SmallPtrSet<Constant*, 4> Operands;
+ for (Value *Op : C->operands())
+ if (OnlyUsedBy(Op, C))
+ Operands.insert(cast<Constant>(Op));
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
+ GV->eraseFromParent();
+ }
+ else if (!isa<Function>(C))
+ if (isa<CompositeType>(C->getType()))
+ C->destroyConstant();
+
+ // If the constant referenced anything, see if we can delete it as well.
+ for (Constant *O : Operands)
+ RemoveDeadConstant(O);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+ for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+ Value *V = VI->getValue();
+ ++VI;
+ if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+ if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
+ // Set name to "", removing from symbol table!
+ V->setName("");
+ }
+ }
+}
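+
+// Illustrative effect of StripSymtab (hypothetical names): a named local
+// value such as %sum loses its name and is subsequently printed as an
+// anonymous value (e.g. %0); values with non-local linkage keep their names
+// because they participate in linkage.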
+
+// Strip any named types of their names.
+static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
+ TypeFinder StructTypes;
+ StructTypes.run(M, false);
+
+ for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+ StructType *STy = StructTypes[i];
+ if (STy->isLiteral() || STy->getName().empty()) continue;
+
+ if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
+ continue;
+
+ STy->setName("");
+ }
+}
+
+/// Find values that are marked as llvm.used.
+static void findUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ UsedValues.insert(LLVMUsed);
+
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+ if (GlobalValue *GV =
+ dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+ UsedValues.insert(GV);
+}
+
+/// StripSymbolNames - Strip symbol names.
+static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+ SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+ findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
+ findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
+ if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+ I->setName(""); // Internal symbols can't participate in linkage
+ }
+
+ for (Function &I : M) {
+ if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
+ if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
+ I.setName(""); // Internal symbols can't participate in linkage
+ if (auto *Symtab = I.getValueSymbolTable())
+ StripSymtab(*Symtab, PreserveDbgInfo);
+ }
+
+ // Remove all names from types.
+ StripTypeNames(M, PreserveDbgInfo);
+
+ return true;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+ Changed |= StripDebugInfo(M);
+ if (!OnlyDebugInfo)
+ Changed |= StripSymbolNames(M, false);
+ return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return StripSymbolNames(M, true);
+}
+
+bool StripDebugDeclare::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ Function *Declare = M.getFunction("llvm.dbg.declare");
+ std::vector<Constant*> DeadConstants;
+
+ if (Declare) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->user_back());
+ Value *Arg1 = CI->getArgOperand(0);
+ Value *Arg2 = CI->getArgOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg1->use_empty()) {
+ if (Constant *C = dyn_cast<Constant>(Arg1))
+ DeadConstants.push_back(C);
+ else
+ RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+ }
+ if (Arg2->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg2))
+ DeadConstants.push_back(C);
+ }
+ Declare->eraseFromParent();
+ }
+
+ while (!DeadConstants.empty()) {
+ Constant *C = DeadConstants.back();
+ DeadConstants.pop_back();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasLocalLinkage())
+ RemoveDeadConstant(GV);
+ } else
+ RemoveDeadConstant(C);
+ }
+
+ return true;
+}
+
+/// Remove any debug info for global variables/functions in the given module for
+/// which said global variable/function no longer exists (i.e. is null).
+///
+/// Debugging information is encoded in LLVM IR using metadata. This is
+/// designed in such a way that debug info for symbols is preserved even if
+/// the symbols themselves are optimized away by the optimizer. This special
+/// pass removes debug info for such symbols.
+bool StripDeadDebugInfo::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+
+ LLVMContext &C = M.getContext();
+
+ // Find all debug info in F. This is actually overkill in terms of what we
+ // want to do, but we want to try and be as resilient as possible in the face
+ // of potential debug info changes by using the formal interfaces given to us
+ // as much as possible.
+ DebugInfoFinder F;
+ F.processModule(M);
+
+ // For each compile unit, find the live set of global variables/functions and
+ // replace the current list of potentially dead global variables/functions
+ // with the live list.
+ SmallVector<Metadata *, 64> LiveGlobalVariables;
+ DenseSet<DIGlobalVariableExpression *> VisitedSet;
+
+ std::set<DIGlobalVariableExpression *> LiveGVs;
+ for (GlobalVariable &GV : M.globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+ GV.getDebugInfo(GVEs);
+ for (auto *GVE : GVEs)
+ LiveGVs.insert(GVE);
+ }
+
+ std::set<DICompileUnit *> LiveCUs;
+ // Any CU referenced from a subprogram is live.
+ for (DISubprogram *SP : F.subprograms()) {
+ if (SP->getUnit())
+ LiveCUs.insert(SP->getUnit());
+ }
+
+ bool HasDeadCUs = false;
+ for (DICompileUnit *DIC : F.compile_units()) {
+ // Create our live global variable list.
+ bool GlobalVariableChange = false;
+ for (auto *DIG : DIC->getGlobalVariables()) {
+ if (DIG->getExpression() && DIG->getExpression()->isConstant())
+ LiveGVs.insert(DIG);
+
+      // Make sure we visit each global variable only once.
+ if (!VisitedSet.insert(DIG).second)
+ continue;
+
+ // If a global variable references DIG, the global variable is live.
+ if (LiveGVs.count(DIG))
+ LiveGlobalVariables.push_back(DIG);
+ else
+ GlobalVariableChange = true;
+ }
+
+ if (!LiveGlobalVariables.empty())
+ LiveCUs.insert(DIC);
+ else if (!LiveCUs.count(DIC))
+ HasDeadCUs = true;
+
+ // If we found dead global variables, replace the current global
+ // variable list with our new live global variable list.
+ if (GlobalVariableChange) {
+ DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
+ Changed = true;
+ }
+
+ // Reset lists for the next iteration.
+ LiveGlobalVariables.clear();
+ }
+
+ if (HasDeadCUs) {
+ // Delete the old node and replace it with a new one
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->clearOperands();
+ if (!LiveCUs.empty()) {
+ for (DICompileUnit *CU : LiveCUs)
+ NMD->addOperand(CU);
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
new file mode 100644
index 000000000000..8ef6bb652309
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -0,0 +1,491 @@
+//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+using namespace llvm;
+
+namespace {
+
+// Promote each local-linkage entity defined by ExportM and used by ImportM by
+// changing visibility and appending the given ModuleId.
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+ SetVector<GlobalValue *> &PromoteExtra) {
+ DenseMap<const Comdat *, Comdat *> RenamedComdats;
+ for (auto &ExportGV : ExportM.global_values()) {
+ if (!ExportGV.hasLocalLinkage())
+ continue;
+
+ auto Name = ExportGV.getName();
+ GlobalValue *ImportGV = ImportM.getNamedValue(Name);
+ if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
+ continue;
+
+ std::string NewName = (Name + ModuleId).str();
+
+ if (const auto *C = ExportGV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
+
+ ExportGV.setName(NewName);
+ ExportGV.setLinkage(GlobalValue::ExternalLinkage);
+ ExportGV.setVisibility(GlobalValue::HiddenVisibility);
+
+ if (ImportGV) {
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+ }
+
+ if (!RenamedComdats.empty())
+ for (auto &GO : ExportM.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
+}
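+
+// Illustrative sketch (the symbol name and module id are hypothetical): if
+// ExportM defines an internal function @foo that ImportM references, and
+// getUniqueModuleId produced "$abc123", then after promotion both modules
+// refer to one hidden, externally visible symbol:
+//
+//   ExportM:  define internal void @foo()           ; before
+//             define hidden void @"foo$abc123"()    ; after
+//   ImportM:  declare void @foo()                   ; before
+//             declare hidden void @"foo$abc123"()   ; after
+//
+// A comdat named after the old symbol is renamed the same way in the
+// exporting module so the comdat group stays consistent.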
+
+// Promote all internal (i.e. distinct) type ids used by the module by replacing
+// them with external type ids formed using the module id.
+//
+// Note that this needs to be done before we clone the module because each clone
+// will receive its own set of distinct metadata nodes.
+void promoteTypeIds(Module &M, StringRef ModuleId) {
+ DenseMap<Metadata *, Metadata *> LocalToGlobal;
+ auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
+ Metadata *MD =
+ cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
+
+ if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
+ Metadata *&GlobalMD = LocalToGlobal[MD];
+ if (!GlobalMD) {
+ std::string NewName =
+ (to_string(LocalToGlobal.size()) + ModuleId).str();
+ GlobalMD = MDString::get(M.getContext(), NewName);
+ }
+
+ CI->setArgOperand(ArgNo,
+ MetadataAsValue::get(M.getContext(), GlobalMD));
+ }
+ };
+
+ if (Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 1);
+ }
+ }
+
+ if (Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+ for (const Use &U : TypeCheckedLoadFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 2);
+ }
+ }
+
+ for (GlobalObject &GO : M.global_objects()) {
+ SmallVector<MDNode *, 1> MDs;
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+
+ GO.eraseMetadata(LLVMContext::MD_type);
+ for (auto MD : MDs) {
+ auto I = LocalToGlobal.find(MD->getOperand(1));
+ if (I == LocalToGlobal.end()) {
+ GO.addMetadata(LLVMContext::MD_type, *MD);
+ continue;
+ }
+ GO.addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(M.getContext(),
+ ArrayRef<Metadata *>{MD->getOperand(0), I->second}));
+ }
+ }
+}
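+
+// Illustrative sketch (the type id and module id are hypothetical): a type
+// test against a distinct, module-local MDNode such as
+//
+//   %p = call i1 @llvm.type.test(i8* %vtable, metadata !3) ; !3 = distinct !{}
+//
+// is rewritten to use a string type id built from a counter and the module
+// id, e.g. metadata !"1$abc123", and the !type attachments on the module's
+// globals are updated to name the same string, so both halves of the split
+// module agree on the type identifier.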
+
+// Drop unused globals, and drop type information from function declarations.
+// FIXME: If we made functions typeless then there would be no need to do this.
+void simplifyExternals(Module &M) {
+ FunctionType *EmptyFT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), false);
+
+ for (auto I = M.begin(), E = M.end(); I != E;) {
+ Function &F = *I++;
+ if (F.isDeclaration() && F.use_empty()) {
+ F.eraseFromParent();
+ continue;
+ }
+
+ if (!F.isDeclaration() || F.getFunctionType() == EmptyFT)
+ continue;
+
+ Function *NewF =
+ Function::Create(EmptyFT, GlobalValue::ExternalLinkage, "", &M);
+ NewF->setVisibility(F.getVisibility());
+ NewF->takeName(&F);
+ F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
+ F.eraseFromParent();
+ }
+
+ for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
+ GlobalVariable &GV = *I++;
+ if (GV.isDeclaration() && GV.use_empty()) {
+ GV.eraseFromParent();
+ continue;
+ }
+ }
+}
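+
+// Illustrative sketch (the declaration is hypothetical): a still-used external
+// declaration such as
+//
+//   declare i32 @f(i32)
+//
+// is replaced in the merged module by "declare void @f()", with each use going
+// through a bitcast of @f back to the original pointer type; declarations and
+// global variable declarations with no uses are simply erased.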
+
+void filterModule(
+ Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E;) {
+ GlobalAlias *GA = &*I++;
+ if (ShouldKeepDefinition(GA))
+ continue;
+
+ GlobalObject *GO;
+ if (GA->getValueType()->isFunctionTy())
+ GO = Function::Create(cast<FunctionType>(GA->getValueType()),
+ GlobalValue::ExternalLinkage, "", M);
+ else
+ GO = new GlobalVariable(
+ *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
+ nullptr, "", nullptr,
+ GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
+ GO->takeName(GA);
+ GA->replaceAllUsesWith(GO);
+ GA->eraseFromParent();
+ }
+
+ for (Function &F : *M) {
+ if (ShouldKeepDefinition(&F))
+ continue;
+
+ F.deleteBody();
+ F.setComdat(nullptr);
+ F.clearMetadata();
+ }
+
+ for (GlobalVariable &GV : M->globals()) {
+ if (ShouldKeepDefinition(&GV))
+ continue;
+
+ GV.setInitializer(nullptr);
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.setComdat(nullptr);
+ GV.clearMetadata();
+ }
+}
+
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+ if (auto *F = dyn_cast<Function>(C))
+ return Fn(F);
+ if (isa<GlobalValue>(C))
+ return;
+ for (Value *Op : C->operands())
+ forEachVirtualFunction(cast<Constant>(Op), Fn);
+}
+
+// If it's possible to split M into regular and thin LTO parts, do so and write
+// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
+// regular LTO bitcode file to OS.
+void splitAndWriteThinLTOBitcode(
+ raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter, Module &M) {
+ std::string ModuleId = getUniqueModuleId(&M);
+ if (ModuleId.empty()) {
+    // We couldn't generate a module ID for this module, so just write it out
+    // as a regular LTO module.
+ WriteBitcodeToFile(&M, OS);
+ if (ThinLinkOS)
+ // We don't have a ThinLTO part, but still write the module to the
+ // ThinLinkOS if requested so that the expected output file is produced.
+ WriteBitcodeToFile(&M, *ThinLinkOS);
+ return;
+ }
+
+ promoteTypeIds(M, ModuleId);
+
+ // Returns whether a global has attached type metadata. Such globals may
+ // participate in CFI or whole-program devirtualization, so they need to
+ // appear in the merged module instead of the thin LTO module.
+ auto HasTypeMetadata = [&](const GlobalObject *GO) {
+ SmallVector<MDNode *, 1> MDs;
+ GO->getMetadata(LLVMContext::MD_type, MDs);
+ return !MDs.empty();
+ };
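+
+  // For illustration (names hypothetical), a vtable carrying type metadata
+  // looks like
+  //
+  //   @_ZTV1A = constant { ... } { ... }, !type !0
+  //   !0 = !{i64 16, !"_ZTS1A"}
+  //
+  // and must stay in the merged (regular LTO) half of the split so that CFI
+  // and whole-program devirtualization can see every vtable for the type.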
+
+  // Collect the set of virtual functions that are eligible for virtual
+  // constant propagation. Each eligible function must not access memory, must
+  // return an integer of width <=64 bits, must take at least one argument,
+  // must not use its first argument (assumed to be "this"), and all arguments
+  // other than the first must be integers of width <=64 bits.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ std::set<const Function *> EligibleVirtualFns;
+ // If any member of a comdat lives in MergedM, put all members of that
+ // comdat in MergedM to keep the comdat together.
+ DenseSet<const Comdat *> MergedMComdats;
+ for (GlobalVariable &GV : M.globals())
+ if (HasTypeMetadata(&GV)) {
+ if (const auto *C = GV.getComdat())
+ MergedMComdats.insert(C);
+ forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+ auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+ if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+ !F->arg_begin()->use_empty())
+ return;
+ for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) {
+ auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+ if (!ArgT || ArgT->getBitWidth() > 64)
+ return;
+ }
+ if (!F->isDeclaration() &&
+ computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+ EligibleVirtualFns.insert(F);
+ });
+ }
+
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MergedM(
+ CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return true;
+ if (auto *F = dyn_cast<Function>(GV))
+ return EligibleVirtualFns.count(F);
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ return HasTypeMetadata(GVar);
+ return false;
+ }));
+ StripDebugInfo(*MergedM);
+
+ for (Function &F : *MergedM)
+ if (!F.isDeclaration()) {
+ // Reset the linkage of all functions eligible for virtual constant
+ // propagation. The canonical definitions live in the thin LTO module so
+ // that they can be imported.
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ F.setComdat(nullptr);
+ }
+
+ SetVector<GlobalValue *> CfiFunctions;
+ for (auto &F : M)
+ if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+ CfiFunctions.insert(&F);
+
+ // Remove all globals with type metadata, globals with comdats that live in
+ // MergedM, and aliases pointing to such globals from the thin LTO module.
+ filterModule(&M, [&](const GlobalValue *GV) {
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (HasTypeMetadata(GVar))
+ return false;
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return false;
+ return true;
+ });
+
+ promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+ promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+ SmallVector<MDNode *, 8> CfiFunctionMDs;
+ for (auto V : CfiFunctions) {
+ Function &F = *cast<Function>(V);
+ SmallVector<MDNode *, 2> Types;
+ F.getMetadata(LLVMContext::MD_type, Types);
+
+ auto &Ctx = MergedM->getContext();
+ SmallVector<Metadata *, 4> Elts;
+ Elts.push_back(MDString::get(Ctx, F.getName()));
+ CfiFunctionLinkage Linkage;
+ if (!F.isDeclarationForLinker())
+ Linkage = CFL_Definition;
+ else if (F.isWeakForLinker())
+ Linkage = CFL_WeakDeclaration;
+ else
+ Linkage = CFL_Declaration;
+ Elts.push_back(ConstantAsMetadata::get(
+ llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+ for (auto Type : Types)
+ Elts.push_back(Type);
+ CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+  if (!CfiFunctionMDs.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+ for (auto MD : CfiFunctionMDs)
+ NMD->addOperand(MD);
+ }
+
+ simplifyExternals(*MergedM);
+
+ // FIXME: Try to re-use BSI and PFI from the original module here.
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+
+ // Mark the merged module as requiring full LTO. We still want an index for
+ // it though, so that it can participate in summary-based dead stripping.
+ MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex MergedMIndex =
+ buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
+
+ SmallVector<char, 0> Buffer;
+
+ BitcodeWriter W(Buffer);
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true, &ModHash);
+ W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W.writeSymtab();
+ W.writeStrtab();
+ OS << Buffer;
+
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info (the merged module was already stripped above)
+ // and write it to the given OS.
+ if (ThinLinkOS) {
+ Buffer.clear();
+ BitcodeWriter W2(Buffer);
+ StripDebugInfo(M);
+ W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/false, &ModHash);
+ W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W2.writeSymtab();
+ W2.writeStrtab();
+ *ThinLinkOS << Buffer;
+ }
+}
+
+// Returns whether this module needs to be split because it uses type metadata.
+bool requiresSplit(Module &M) {
+ SmallVector<MDNode *, 1> MDs;
+ for (auto &GO : M.global_objects()) {
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+ if (!MDs.empty())
+ return true;
+ }
+
+ return false;
+}
+
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter,
+ Module &M, const ModuleSummaryIndex *Index) {
+ // See if this module has any type metadata. If so, we need to split it.
+ if (requiresSplit(M))
+ return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
+
+ // Otherwise we can just write it out as a regular module.
+
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ /*GenerateHash=*/true, &ModHash);
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info and write it to the given OS.
+ if (ThinLinkOS) {
+ StripDebugInfo(M);
+ WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ Index,
+ /*GenerateHash=*/false, &ModHash);
+ }
+}
+
+class WriteThinLTOBitcode : public ModulePass {
+ raw_ostream &OS; // raw_ostream to print on
+ // The output stream on which to emit a minimized module for use
+ // just in the thin link, if requested.
+ raw_ostream *ThinLinkOS;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+ : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
+
+ bool runOnModule(Module &M) override {
+ const ModuleSummaryIndex *Index =
+ &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
+ writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
+ return true;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // anonymous namespace
+
+char WriteThinLTOBitcode::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+ raw_ostream *ThinLinkOS) {
+ return new WriteThinLTOBitcode(Str, ThinLinkOS);
+}
+
+PreservedAnalyses
+llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ writeThinLTOBitcode(OS, ThinLinkOS,
+ [&FAM](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ },
+ M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
new file mode 100644
index 000000000000..00769cd63229
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -0,0 +1,1424 @@
+//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements whole program optimization of virtual calls in cases
+// where we know (via !type metadata) that the list of callees is fixed. This
+// includes the following:
+// - Single implementation devirtualization: if a virtual call has a single
+// possible callee, replace all calls with a direct call to that callee.
+// - Virtual constant propagation: if the virtual function's return type is an
+// integer <=64 bits and all possible callees are readnone, for each class and
+// each list of constant arguments: evaluate the function, store the return
+// value alongside the virtual table, and rewrite each virtual call as a load
+// from the virtual table.
+// - Uniform return value optimization: if the conditions for virtual constant
+// propagation hold and each function returns the same constant value, replace
+// each virtual call with that constant.
+// - Unique return value optimization for i1 return values: if the conditions
+// for virtual constant propagation hold and a single vtable's function
+// returns 0, or a single vtable's function returns 1, replace each virtual
+// call with a comparison of the vptr against that vtable's address.
+//
+// This pass is intended to be used during the regular and thin LTO pipelines.
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either of the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During
+// ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+// that contains all vtables with !type metadata that participate in the link.
+// The pass computes a resolution for each virtual call and stores it in the
+// type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+//   modules. The pass applies the resolutions previously computed during the
+//   export phase to each eligible virtual call.
+//
+//===----------------------------------------------------------------------===//
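+
+// As a source-level illustration (hypothetical classes), given
+//
+//   struct A { virtual int f() { return 1; } };
+//   struct B : A { int f() override { return 1; } };
+//   int g(A *a) { return a->f(); }
+//
+// single implementation devirtualization would fire if A::f were the only
+// implementation reachable through the type, turning the indirect call in g
+// into a direct call; uniform return value optimization applies here because
+// A::f and B::f both return the constant 1, so the call can be replaced by
+// that constant outright.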
+
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <set>
+#include <string>
+
+using namespace llvm;
+using namespace wholeprogramdevirt;
+
+#define DEBUG_TYPE "wholeprogramdevirt"
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "wholeprogramdevirt-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "wholeprogramdevirt-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "wholeprogramdevirt-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
+// Find the minimum offset at which we may store a value of size Size bits. If
+// IsAfter is set, look for an offset after the object; otherwise look for an
+// offset before the object.
+uint64_t
+wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
+ bool IsAfter, uint64_t Size) {
+ // Find a minimum offset taking into account only vtable sizes.
+ uint64_t MinByte = 0;
+ for (const VirtualCallTarget &Target : Targets) {
+ if (IsAfter)
+ MinByte = std::max(MinByte, Target.minAfterBytes());
+ else
+ MinByte = std::max(MinByte, Target.minBeforeBytes());
+ }
+
+ // Build a vector of arrays of bytes covering, for each target, a slice of the
+ // used region (see AccumBitVector::BytesUsed in
+ // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
+ // this aligns the used regions to start at MinByte.
+ //
+ // In this example, A, B and C are vtables, # is a byte already allocated for
+ // a virtual function pointer, AAAA... (etc.) are the used regions for the
+ // vtables and Offset(X) is the value computed for the Offset variable below
+ // for X.
+ //
+ // Offset(A)
+ // | |
+ // |MinByte
+ // A: ################AAAAAAAA|AAAAAAAA
+ // B: ########BBBBBBBBBBBBBBBB|BBBB
+ // C: ########################|CCCCCCCCCCCCCCCC
+ // | Offset(B) |
+ //
+ // This code produces the slices of A, B and C that appear after the divider
+ // at MinByte.
+ std::vector<ArrayRef<uint8_t>> Used;
+ for (const VirtualCallTarget &Target : Targets) {
+ ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
+ : Target.TM->Bits->Before.BytesUsed;
+ uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
+ : MinByte - Target.minBeforeBytes();
+
+    // Disregard used regions that are no larger than Offset. These are
+    // effectively all-free regions that do not need to be checked.
+ if (VTUsed.size() > Offset)
+ Used.push_back(VTUsed.slice(Offset));
+ }
+
+ if (Size == 1) {
+ // Find a free bit in each member of Used.
+ for (unsigned I = 0;; ++I) {
+ uint8_t BitsUsed = 0;
+ for (auto &&B : Used)
+ if (I < B.size())
+ BitsUsed |= B[I];
+ if (BitsUsed != 0xff)
+ return (MinByte + I) * 8 +
+ countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
+ }
+ } else {
+ // Find a free (Size/8) byte region in each member of Used.
+ // FIXME: see if alignment helps.
+ for (unsigned I = 0;; ++I) {
+ for (auto &&B : Used) {
+ unsigned Byte = 0;
+ while ((I + Byte) < B.size() && Byte < (Size / 8)) {
+ if (B[I + Byte])
+ goto NextI;
+ ++Byte;
+ }
+ }
+ return (MinByte + I) * 8;
+ NextI:;
+ }
+ }
+}
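+
+// Worked example (values hypothetical): with a single target, MinByte == 0 and
+// BytesUsed == {0xff, 0x3f}, the Size == 1 search finds byte 0 fully used and
+// bits 0-5 of byte 1 used, so at I == 1 BitsUsed == 0x3f and the function
+// returns (0 + 1) * 8 + countTrailingZeros(0xc0) == 14, i.e. bit 6 of the
+// second byte.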
+
+void wholeprogramdevirt::setBeforeReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = -(AllocBefore / 8 + 1);
+ else
+ OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
+ OffsetBit = AllocBefore % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setBeforeBit(AllocBefore);
+ else
+ Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
+ }
+}
+
+void wholeprogramdevirt::setAfterReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = AllocAfter / 8;
+ else
+ OffsetByte = (AllocAfter + 7) / 8;
+ OffsetBit = AllocAfter % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setAfterBit(AllocAfter);
+ else
+ Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
+ }
+}
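+
+// Worked example (sizes hypothetical): for a 1-bit return value stored after a
+// vtable whose address point lies 16 bytes into a 40-byte object,
+// minAfterBytes() is 24, so findLowestOffset returns AllocAfter == 192 when
+// nothing has been allocated yet; setAfterReturnValues then yields
+// OffsetByte == 24 and OffsetBit == 0, i.e. bit 0 of the first byte appended
+// immediately after the vtable, addressed relative to the address point.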
+
+VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
+    : Fn(Fn), TM(TM),
+      IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()),
+      WasDevirt(false) {}
+
+namespace {
+
+// A slot in a set of virtual tables. The TypeID identifies the set of virtual
+// tables, and the ByteOffset is the offset in bytes from the address point to
+// the virtual function pointer.
+struct VTableSlot {
+ Metadata *TypeID;
+ uint64_t ByteOffset;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<VTableSlot> {
+ static VTableSlot getEmptyKey() {
+ return {DenseMapInfo<Metadata *>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlot getTombstoneKey() {
+ return {DenseMapInfo<Metadata *>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlot &I) {
+ return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlot &LHS,
+ const VTableSlot &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+} // end namespace llvm
+
+namespace {
+
+// A virtual call site. VTable is the loaded virtual table pointer, and CS is
+// the indirect virtual call.
+struct VirtualCallSite {
+ Value *VTable;
+ CallSite CS;
+
+ // If non-null, this field points to the associated unsafe use count stored in
+ // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
+ // of that field for details.
+ unsigned *NumUnsafeUses;
+
+ void emitRemark(const Twine &OptName, const Twine &TargetName) {
+ Function *F = CS.getCaller();
+ emitOptimizationRemark(
+ F->getContext(), DEBUG_TYPE, *F,
+ CS.getInstruction()->getDebugLoc(),
+ OptName + ": devirtualized a call to " + TargetName);
+ }
+
+ void replaceAndErase(const Twine &OptName, const Twine &TargetName,
+ bool RemarksEnabled, Value *New) {
+ if (RemarksEnabled)
+ emitRemark(OptName, TargetName);
+ CS->replaceAllUsesWith(New);
+ if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ BranchInst::Create(II->getNormalDest(), CS.getInstruction());
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ }
+ CS->eraseFromParent();
+ // This use is no longer unsafe.
+ if (NumUnsafeUses)
+ --*NumUnsafeUses;
+ }
+};
+
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+ /// The set of call sites for this slot. Used during regular LTO and the
+ /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+ /// call sites that appear in the merged module itself); in each of these
+ /// cases we are directly operating on the call sites at the IR level.
+ std::vector<VirtualCallSite> CallSites;
+
+ // These fields are used during the export phase of ThinLTO and reflect
+ // information collected from function summaries.
+
+ /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+ /// this slot.
+ bool SummaryHasTypeTestAssumeUsers;
+
+ /// CFI-specific: a vector containing the list of function summaries that use
+ /// the llvm.type.checked.load intrinsic and therefore will require
+ /// resolutions for llvm.type.test in order to implement CFI checks if
+ /// devirtualization was unsuccessful. If devirtualization was successful, the
+ /// pass will clear this vector by calling markDevirt(). If at the end of the
+ /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+ /// to each of the function summaries in the vector.
+ std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+ bool isExported() const {
+ return SummaryHasTypeTestAssumeUsers ||
+ !SummaryTypeCheckedLoadUsers.empty();
+ }
+
+ /// As explained in the comment for SummaryTypeCheckedLoadUsers.
+ void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+};
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+ // The set of call sites which do not have all constant integer arguments
+ // (excluding "this").
+ CallSiteInfo CSInfo;
+
+ // The set of call sites with all constant integer arguments (excluding
+ // "this"), grouped by argument list.
+ std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+ void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses);
+
+private:
+ CallSiteInfo &findCallSiteInfo(CallSite CS);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
+ std::vector<uint64_t> Args;
+ auto *CI = dyn_cast<IntegerType>(CS.getType());
+ if (!CI || CI->getBitWidth() > 64 || CS.arg_empty())
+ return CSInfo;
+ for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) {
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64)
+ return CSInfo;
+ Args.push_back(CI->getZExtValue());
+ }
+ return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
+ unsigned *NumUnsafeUses) {
+ findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+}
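+
+// For illustration (the call is hypothetical): a virtual call such as
+// p->f(1, 5) whose return type is an integer of width <=64 bits and whose
+// non-"this" arguments are all constant integers is recorded under
+// ConstCSInfo[{1, 5}] so it can be evaluated per argument list; a call with
+// any non-constant or wider argument, or with a non-integer return type,
+// falls back to the generic CSInfo bucket.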
+
+struct DevirtModule {
+ Module &M;
+ function_ref<AAResults &(Function &)> AARGetter;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ IntegerType *Int8Ty;
+ PointerType *Int8PtrTy;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+
+ bool RemarksEnabled;
+
+ MapVector<VTableSlot, VTableSlotInfo> CallSlots;
+
+ // This map keeps track of the number of "unsafe" uses of a loaded function
+ // pointer. The key is the associated llvm.type.test intrinsic call generated
+ // by this pass. An unsafe use is one that calls the loaded function pointer
+ // directly. Every time we eliminate an unsafe use (for example, by
+ // devirtualizing it or by applying virtual constant propagation), we
+ // decrement the value stored in this map. If a value reaches zero, we can
+ // eliminate the type check by RAUWing the associated llvm.type.test call with
+ // true.
+ std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
+
+ DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())),
+ Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
+ Int32Ty(Type::getInt32Ty(M.getContext())),
+ Int64Ty(Type::getInt64Ty(M.getContext())),
+ IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ RemarksEnabled(areRemarksEnabled()) {
+ assert(!(ExportSummary && ImportSummary));
+ }
+
+ bool areRemarksEnabled();
+
+ void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc);
+ void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
+
+ void buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ Constant *getPointerAtOffset(Constant *I, uint64_t Offset);
+ bool
+ tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos,
+ uint64_t ByteOffset);
+
+ void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+ bool &IsExported);
+ bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res);
+
+ bool tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args);
+
+ void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal);
+ bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res);
+
+ // Returns the global symbol name that is used to export information about the
+ // given vtable slot and list of arguments.
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+
+ // This function is called during the export phase to create a symbol
+ // definition containing information about the given vtable slot and list of
+ // arguments.
+ void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ Constant *C);
+
+ // This function is called during the import phase to create a reference to
+ // the symbol definition created during the export phase.
+ Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth = 0);
+
+ void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+ Constant *UniqueMemberAddr);
+ bool tryUniqueRetValOpt(unsigned BitWidth,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+ void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit);
+ bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
+ void rebuildGlobal(VTableBits &B);
+
+ // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the associated type tests by replacing them with true.
+ void removeRedundantTypeTests();
+
+ bool run();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M,
+ function_ref<AAResults &(Function &)> AARGetter);
+};
+
+struct WholeProgramDevirt : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ if (UseCommandLine)
+ return DevirtModule::runForTesting(M, LegacyAARGetter(*this));
+ return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary)
+ .run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+char WholeProgramDevirt::ID = 0;
+
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new WholeProgramDevirt(ExportSummary, ImportSummary);
+}
+
+PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+ if (!DevirtModule(M, AARGetter, nullptr, nullptr).run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+bool DevirtModule::runForTesting(
+ Module &M, function_ref<AAResults &(Function &)> AARGetter) {
+ ModuleSummaryIndex Summary;
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ DevirtModule(
+ M, AARGetter,
+ ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .run();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr(
+ "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
+void DevirtModule::buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ DenseMap<GlobalVariable *, VTableBits *> GVToBits;
+ Bits.reserve(M.getGlobalList().size());
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (Types.empty())
+ continue;
+
+ VTableBits *&BitsPtr = GVToBits[&GV];
+ if (!BitsPtr) {
+ Bits.emplace_back();
+ Bits.back().GV = &GV;
+ Bits.back().ObjectSize =
+ M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
+ BitsPtr = &Bits.back();
+ }
+
+ for (MDNode *Type : Types) {
+ auto TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert({BitsPtr, Offset});
+ }
+ }
+}
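+
+// For illustration (names hypothetical): a vtable global annotated with
+//
+//   @_ZTV1A = constant { [4 x i8*] } ..., !type !{i64 16, !"_ZTS1A"}
+//
+// contributes a TypeMemberInfo of {<bits for @_ZTV1A>, 16} to
+// TypeIdMap[!"_ZTS1A"], recording that an address point for _ZTS1A lies 16
+// bytes into this vtable.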
+
+Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) {
+ if (I->getType()->isPointerTy()) {
+ if (Offset == 0)
+ return I;
+ return nullptr;
+ }
+
+ const DataLayout &DL = M.getDataLayout();
+
+ if (auto *C = dyn_cast<ConstantStruct>(I)) {
+ const StructLayout *SL = DL.getStructLayout(C->getType());
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Op = SL->getElementContainingOffset(Offset);
+ return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+ Offset - SL->getElementOffset(Op));
+ }
+ if (auto *C = dyn_cast<ConstantArray>(I)) {
+ ArrayType *VTableTy = C->getType();
+ uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType());
+
+ unsigned Op = Offset / ElemSize;
+ if (Op >= C->getNumOperands())
+ return nullptr;
+
+ return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+ Offset % ElemSize);
+ }
+ return nullptr;
+}
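+
+// For illustration (layout hypothetical): with 8-byte pointers, looking up
+// offset 16 in an initializer of type { [4 x i8*] } descends into the struct
+// element at offset 0, then into array element 16 / 8 == 2, and returns that
+// function pointer constant; an offset that does not land exactly on a
+// pointer yields nullptr.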
+
+bool DevirtModule::tryFindVirtualCallTargets(
+ std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
+ for (const TypeMemberInfo &TM : TypeMemberInfos) {
+ if (!TM.Bits->GV->isConstant())
+ return false;
+
+ Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
+ TM.Offset + ByteOffset);
+ if (!Ptr)
+ return false;
+
+ auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!Fn)
+ return false;
+
+ // We can disregard __cxa_pure_virtual as a possible call target, as
+ // calls to pure virtuals are UB.
+ if (Fn->getName() == "__cxa_pure_virtual")
+ continue;
+
+ TargetsForSlot.push_back({Fn, &TM});
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+ Constant *TheFn, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl", TheFn->getName());
+ VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CS.getCalledValue()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ if (CSInfo.isExported()) {
+ IsExported = true;
+ CSInfo.markDevirt();
+ }
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
+bool DevirtModule::trySingleImplDevirt(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ Function *TheFn = TargetsForSlot[0].Fn;
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target.Fn)
+ return false;
+
+ // If so, update each call site to call that implementation directly.
+ if (RemarksEnabled)
+ TargetsForSlot[0].WasDevirt = true;
+
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+ if (!IsExported)
+ return false;
+
+ // If the only implementation has local linkage, we must promote to external
+ // to make it visible to thin LTO objects. We can only get here during the
+ // ThinLTO export phase.
+ if (TheFn->hasLocalLinkage()) {
+ TheFn->setLinkage(GlobalValue::ExternalLinkage);
+ TheFn->setVisibility(GlobalValue::HiddenVisibility);
+ TheFn->setName(TheFn->getName() + "$merged");
+ }
+
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ Res->SingleImplName = TheFn->getName();
+
+ return true;
+}
+
+bool DevirtModule::tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args) {
+ // Evaluate each function and store the result in each target's RetVal
+ // field.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->arg_size() != Args.size() + 1)
+ return false;
+
+ Evaluator Eval(M.getDataLayout(), nullptr);
+ SmallVector<Constant *, 2> EvalArgs;
+ EvalArgs.push_back(
+ Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
+ for (unsigned I = 0; I != Args.size(); ++I) {
+ auto *ArgTy = dyn_cast<IntegerType>(
+ Target.Fn->getFunctionType()->getParamType(I + 1));
+ if (!ArgTy)
+ return false;
+ EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+ }
+
+ Constant *RetVal;
+ if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
+ !isa<ConstantInt>(RetVal))
+ return false;
+ Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
+ }
+ return true;
+}
+
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal) {
+ for (auto Call : CSInfo.CallSites)
+ Call.replaceAndErase(
+ "uniform-ret-val", FnName, RemarksEnabled,
+ ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
+ CSInfo.markDevirt();
+}
+
+bool DevirtModule::tryUniformRetValOpt(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res) {
+ // Uniform return value optimization. If all functions return the same
+ // constant, replace all calls with that constant.
+ uint64_t TheRetVal = TargetsForSlot[0].RetVal;
+ for (const VirtualCallTarget &Target : TargetsForSlot)
+ if (Target.RetVal != TheRetVal)
+ return false;
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ Res->Info = TheRetVal;
+ }
+
+ applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+ return true;
+}
+
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+ ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ std::string FullName = "__typeid_";
+ raw_string_ostream OS(FullName);
+ OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+ for (uint64_t Arg : Args)
+ OS << '_' << Arg;
+ OS << '_' << Name;
+ return OS.str();
+}
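+
+// For illustration (identifiers hypothetical): for a slot with type id
+// !"_ZTS1A" at byte offset 0, constant arguments {1, 5} and the name "byte",
+// this produces the symbol name "__typeid__ZTS1A_0_1_5_byte".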
+
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, Constant *C) {
+ GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ getGlobalName(Slot, Args, Name), C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth) {
+ Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return GV;
+}
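+
+// For illustration: with AbsWidth == 1 the imported symbol is given an
+// !absolute_symbol range of [0, 2), telling the backend that its address fits
+// in a single bit; when AbsWidth equals the pointer width, the {-1, -1} range
+// encodes the full set of possible values.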
+
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ bool IsOne,
+ Constant *UniqueMemberAddr) {
+ for (auto &&Call : CSInfo.CallSites) {
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ Call.VTable, UniqueMemberAddr);
+ Cmp = B.CreateZExt(Cmp, Call.CS->getType());
+ Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp);
+ }
+ CSInfo.markDevirt();
+}
+
+bool DevirtModule::tryUniqueRetValOpt(
+ unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args) {
+ // IsOne controls whether we look for a 0 or a 1.
+ auto tryUniqueRetValOptFor = [&](bool IsOne) {
+ const TypeMemberInfo *UniqueMember = nullptr;
+ for (const VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.RetVal == (IsOne ? 1 : 0)) {
+ if (UniqueMember)
+ return false;
+ UniqueMember = Target.TM;
+ }
+ }
+
+ // We should have found a unique member or bailed out by now. We already
+ // checked for a uniform return value in tryUniformRetValOpt.
+ assert(UniqueMember);
+
+ Constant *UniqueMemberAddr =
+ ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
+ UniqueMemberAddr = ConstantExpr::getGetElementPtr(
+ Int8Ty, UniqueMemberAddr,
+ ConstantInt::get(Int64Ty, UniqueMember->Offset));
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ Res->Info = IsOne;
+
+ exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
+ }
+
+ // Replace each call with the comparison.
+ applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+ UniqueMemberAddr);
+
+ // Update devirtualization statistics for targets.
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ return true;
+ };
+
+ if (BitWidth == 1) {
+ if (tryUniqueRetValOptFor(true))
+ return true;
+ if (tryUniqueRetValOptFor(false))
+ return true;
+ }
+ return false;
+}
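+
+// For illustration (class names hypothetical): if B::f is the only target in
+// an i1 slot that returns true, each virtual call is rewritten into a
+// comparison of the call site's vtable pointer against the address of B's
+// type member (exported as a "..._unique_member" symbol when the resolution
+// is exported), zero-extended to the call's return type, so the call itself
+// disappears.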
+
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit) {
+ for (auto Call : CSInfo.CallSites) {
+ auto *RetType = cast<IntegerType>(Call.CS.getType());
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
+ if (RetType->getBitWidth() == 1) {
+ Value *Bits = B.CreateLoad(Addr);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+ IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val);
+ }
+ }
+ CSInfo.markDevirt();
+}
+
+bool DevirtModule::tryVirtualConstProp(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ // This only works if the function returns an integer.
+ auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
+ if (!RetType)
+ return false;
+ unsigned BitWidth = RetType->getBitWidth();
+ if (BitWidth > 64)
+ return false;
+
+ // Make sure that each function is defined, does not access memory, takes at
+ // least one argument, does not use its first argument (which we assume is
+ // 'this'), and has the same return type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->isDeclaration() ||
+ computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+ MAK_ReadNone ||
+ Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
+ Target.Fn->getReturnType() != RetType)
+ return false;
+ }
+
+ for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
+ if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
+ continue;
+
+ WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+ if (Res)
+ ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+ if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
+ continue;
+
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+ ResByArg, Slot, CSByConstantArg.first))
+ continue;
+
+ // Find an allocation offset in bits in all vtables associated with the
+ // type.
+ uint64_t AllocBefore =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
+ uint64_t AllocAfter =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
+
+ // Calculate the total amount of padding needed to store a value at both
+ // ends of the object.
+ uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
+ for (auto &&Target : TargetsForSlot) {
+ TotalPaddingBefore += std::max<int64_t>(
+ (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
+ TotalPaddingAfter += std::max<int64_t>(
+ (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
+ }
+
+ // If the amount of padding is too large, give up.
+ // FIXME: do something smarter here.
+ if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
+ continue;
+
+ // Calculate the offset to the value as a (possibly negative) byte offset
+ // and (if applicable) a bit offset, and store the values in the targets.
+ int64_t OffsetByte;
+ uint64_t OffsetBit;
+ if (TotalPaddingBefore <= TotalPaddingAfter)
+ setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
+ OffsetBit);
+ else
+ setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
+ OffsetBit);
+
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+ Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+
+ if (CSByConstantArg.second.isExported()) {
+ ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ exportGlobal(Slot, CSByConstantArg.first, "byte",
+ ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy));
+ exportGlobal(Slot, CSByConstantArg.first, "bit",
+ ConstantExpr::getIntToPtr(BitConst, Int8PtrTy));
+ }
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ applyVirtualConstProp(CSByConstantArg.second,
+ TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
+ }
+ return true;
+}
+
+void DevirtModule::rebuildGlobal(VTableBits &B) {
+ if (B.Before.Bytes.empty() && B.After.Bytes.empty())
+ return;
+
+ // Align each byte array to pointer width.
+ unsigned PointerSize = M.getDataLayout().getPointerSize();
+ B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), PointerSize));
+ B.After.Bytes.resize(alignTo(B.After.Bytes.size(), PointerSize));
+
+ // Before was stored in reverse order; flip it now.
+ for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
+ std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
+
+ // Build an anonymous global containing the before bytes, followed by the
+ // original initializer, followed by the after bytes.
+ auto NewInit = ConstantStruct::getAnon(
+ {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
+ B.GV->getInitializer(),
+ ConstantDataArray::get(M.getContext(), B.After.Bytes)});
+ auto NewGV =
+ new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
+ GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
+ NewGV->setSection(B.GV->getSection());
+ NewGV->setComdat(B.GV->getComdat());
+
+ // Copy the original vtable's metadata to the anonymous global, adjusting
+ // offsets as required.
+ NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
+
+ // Build an alias named after the original global, pointing at the second
+ // element (the original initializer).
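+  // The overall result is roughly the following (an illustrative sketch;
+  // the names are made up):
+  //   @0 = private constant { [N x i8], <original type>, [M x i8] } { ... }
+  //   @<original name> = alias ..., getelementptr(..., @0, i32 0, i32 1)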
+ auto Alias = GlobalAlias::create(
+ B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
+ ConstantExpr::getGetElementPtr(
+ NewInit->getType(), NewGV,
+ ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, 1)}),
+ &M);
+ Alias->setVisibility(B.GV->getVisibility());
+ Alias->takeName(B.GV);
+
+ B.GV->replaceAllUsesWith(Alias);
+ B.GV->eraseFromParent();
+}
+
+bool DevirtModule::areRemarksEnabled() {
+ const auto &FL = M.getFunctionList();
+ if (FL.empty())
+ return false;
+ const Function &Fn = FL.front();
+
+ const auto &BBL = Fn.getBasicBlockList();
+ if (BBL.empty())
+ return false;
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
+ return DI.isEnabled();
+}
+
+void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
+ Function *AssumeFunc) {
+ // Find all virtual calls via a virtual table pointer %p under an assumption
+ // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
+ // points to a member of the type identifier %md. Group calls by (type ID,
+ // offset) pair (effectively the identity of the virtual function) and store
+ // to CallSlots.
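+  //
+  // For illustration only (the value names and the type name !"_ZTS1A" are
+  // made up), the pattern being scanned for looks roughly like:
+  //   %vtable = load i8*, i8** %vtableptr
+  //   %p = call i1 @llvm.type.test(i8* %vtable, metadata !"_ZTS1A")
+  //   call void @llvm.assume(i1 %p)
+  //   ; ... followed by a virtual call through a function pointer loaded
+  //   ; from %vtable ...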
+ DenseSet<Value *> SeenPtrs;
+ for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ // Search for virtual calls based on %p and add them to DevirtCalls.
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<CallInst *, 1> Assumes;
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+
+ // If we found any, add them to CallSlots. Only do this if we haven't seen
+ // the vtable pointer before, as it may have been CSE'd with pointers from
+ // other call sites, and we don't want to process call sites multiple times.
+ if (!Assumes.empty()) {
+ Metadata *TypeId =
+ cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
+ Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
+ if (SeenPtrs.insert(Ptr).second) {
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0),
+ Call.CS, nullptr);
+ }
+ }
+ }
+
+ // We no longer need the assumes or the type test.
+ for (auto Assume : Assumes)
+ Assume->eraseFromParent();
+ // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
+ // may use the vtable argument later.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+ }
+}
+
+void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
+ Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ for (auto I = TypeCheckedLoadFunc->use_begin(),
+ E = TypeCheckedLoadFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Offset = CI->getArgOperand(1);
+ Value *TypeIdValue = CI->getArgOperand(2);
+ Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<Instruction *, 1> LoadedPtrs;
+ SmallVector<Instruction *, 1> Preds;
+ bool HasNonCallUses = false;
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI);
+
+ // Start by generating "pessimistic" code that explicitly loads the function
+ // pointer from the vtable and performs the type check. If possible, we will
+ // eliminate the load and the type check later.
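+    // Roughly (an illustrative sketch; the value names are made up), the
+    // generated code has the form:
+    //   %slot = getelementptr i8, i8* %ptr, i32 %offset
+    //   %fptr = load i8*, i8** (%slot cast to i8**)
+    //   %ok = call i1 @llvm.type.test(i8* %ptr, metadata %typeid)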
+
+ // If possible, only generate the load at the point where it is used.
+ // This helps avoid unnecessary spills.
+ IRBuilder<> LoadB(
+ (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
+ Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
+ Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
+ Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
+
+ for (Instruction *LoadedPtr : LoadedPtrs) {
+ LoadedPtr->replaceAllUsesWith(LoadedValue);
+ LoadedPtr->eraseFromParent();
+ }
+
+ // Likewise for the type test.
+ IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
+ CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
+
+ for (Instruction *Pred : Preds) {
+ Pred->replaceAllUsesWith(TypeTestCall);
+ Pred->eraseFromParent();
+ }
+
+ // We have already erased any extractvalue instructions that refer to the
+ // intrinsic call, but the intrinsic may have other non-extractvalue uses
+ // (although this is unlikely). In that case, explicitly build a pair and
+ // RAUW it.
+ if (!CI->use_empty()) {
+ Value *Pair = UndefValue::get(CI->getType());
+ IRBuilder<> B(CI);
+ Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
+ Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
+ CI->replaceAllUsesWith(Pair);
+ }
+
+ // The number of unsafe uses is initially the number of uses.
+ auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
+ NumUnsafeUses = DevirtCalls.size();
+
+ // If the function pointer has a non-call user, we cannot eliminate the type
+ // check, as one of those users may eventually call the pointer. Increment
+ // the unsafe use count to make sure it cannot reach zero.
+ if (HasNonCallUses)
+ ++NumUnsafeUses;
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS,
+ &NumUnsafeUses);
+ }
+
+ CI->eraseFromParent();
+ }
+}
+
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString());
+ if (!TidSummary)
+ return;
+ auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+ if (ResI == TidSummary->WPDRes.end())
+ return;
+ const WholeProgramDevirtResolution &Res = ResI->second;
+
+ if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+ // The type of the function in the declaration is irrelevant because every
+ // call site will cast it to the correct type.
+ auto *SingleImpl = M.getOrInsertFunction(
+ Res.SingleImplName, Type::getVoidTy(M.getContext()));
+
+ // This is the import phase so we should not be exporting anything.
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+ assert(!IsExported);
+ }
+
+ for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+ auto I = Res.ResByArg.find(CSByConstantArg.first);
+ if (I == Res.ResByArg.end())
+ continue;
+ auto &ResByArg = I->second;
+ // FIXME: We should figure out what to do about the "function name" argument
+ // to the apply* functions, as the function names are unavailable during the
+ // importing phase. For now we just pass the empty string. This does not
+ // impact correctness because the function names are just used for remarks.
+ switch (ResByArg.TheKind) {
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+ break;
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+ Constant *UniqueMemberAddr =
+ importGlobal(Slot, CSByConstantArg.first, "unique_member");
+ applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+ UniqueMemberAddr);
+ break;
+ }
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+ Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32);
+ Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty);
+ Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8);
+ Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty);
+ applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+      break;
+    }
+ default:
+ break;
+ }
+ }
+}
+
+void DevirtModule::removeRedundantTypeTests() {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+}
+
+bool DevirtModule::run() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+
+ // Normally if there are no users of the devirtualization intrinsics in the
+ // module, this pass has nothing to do. But if we are exporting, we also need
+ // to handle any users that appear only in the function summaries.
+ if (!ExportSummary &&
+ (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ AssumeFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ if (TypeTestFunc && AssumeFunc)
+ scanTypeTestUsers(TypeTestFunc, AssumeFunc);
+
+ if (TypeCheckedLoadFunc)
+ scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+
+ if (ImportSummary) {
+ for (auto &S : CallSlots)
+ importResolution(S.first, S.second);
+
+ removeRedundantTypeTests();
+
+ // The rest of the code is only necessary when exporting or during regular
+ // LTO, so we are done.
+ return true;
+ }
+
+ // Rebuild type metadata into a map for easy lookup.
+ std::vector<VTableBits> Bits;
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
+ buildTypeIdentifierMap(Bits, TypeIdMap);
+ if (TypeIdMap.empty())
+ return true;
+
+ // Collect information from summary about which calls to try to devirtualize.
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdMap) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
+ true;
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}]
+ .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryHasTypeTestAssumeUsers = true;
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ }
+ }
+ }
+
+ // For each (type, offset) pair:
+ bool DidVirtualConstProp = false;
+ std::map<std::string, Function*> DevirtTargets;
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<VirtualCallTarget> TargetsForSlot;
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+ S.first.ByteOffset)) {
+ WholeProgramDevirtResolution *Res = nullptr;
+ if (ExportSummary && isa<MDString>(S.first.TypeID))
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+
+ if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
+ DidVirtualConstProp = true;
+
+      // Collect functions devirtualized for at least one call site, for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[T.Fn->getName()] = T.Fn;
+ }
+
+ // CFI-specific: if we are exporting and any llvm.type.checked.load
+ // intrinsics were *not* devirtualized, we need to add the resulting
+ // llvm.type.test intrinsics to the function summaries so that the
+ // LowerTypeTests pass will export them.
+ if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+ auto GUID =
+ GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+ for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ for (auto &CCS : S.second.ConstCSInfo)
+ for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ }
+ }
+
+ if (RemarksEnabled) {
+ // Generate remarks for each devirtualized function.
+ for (const auto &DT : DevirtTargets) {
+ Function *F = DT.second;
+ DISubprogram *SP = F->getSubprogram();
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP,
+ Twine("devirtualized ") + F->getName());
+ }
+ }
+
+ removeRedundantTypeTests();
+
+ // Rebuild each global we touched as part of virtual constant propagation to
+ // include the before and after bytes.
+ if (DidVirtualConstProp)
+ for (VTableBits &B : Bits)
+ rebuildGlobal(B);
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
new file mode 100644
index 000000000000..809471cfd74f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -0,0 +1,1750 @@
+//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for add, fadd, sub, and fsub.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+namespace {
+
+  /// Class representing the coefficient of a floating-point addend.
+  /// This class needs to be highly efficient, which is especially true for
+  /// the constructor. As of this writing, the cost of the default
+  /// constructor is merely a 4-byte store of zero (assuming the compiler is
+  /// able to perform write-merging).
+ ///
+ class FAddendCoef {
+ public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1, so the constructor
+    // is expensive. In order to avoid the cost of the constructor, we should
+    // reuse some instances whenever possible. The pre-created instances
+    // FAddCombine::Add[0-5] embody this idea.
+ //
+ FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {}
+ ~FAddendCoef();
+
+ void set(short C) {
+ assert(!insaneIntVal(C) && "Insane coefficient");
+ IsFp = false; IntVal = C;
+ }
+
+ void set(const APFloat& C);
+
+ void negate();
+
+ bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
+ Value *getValue(Type *) const;
+
+ // If possible, don't define operator+/operator- etc because these
+ // operators inevitably call FAddendCoef's constructor which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ bool isOne() const { return isInt() && IntVal == 1; }
+ bool isTwo() const { return isInt() && IntVal == 2; }
+ bool isMinusOne() const { return isInt() && IntVal == -1; }
+ bool isMinusTwo() const { return isInt() && IntVal == -2; }
+
+ private:
+ bool insaneIntVal(int V) { return V > 4 || V < -4; }
+ APFloat *getFpValPtr()
+ { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); }
+ const APFloat *getFpValPtr() const
+ { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); }
+
+ const APFloat &getFpVal() const {
+ assert(IsFp && BufHasFpVal && "Incorret state");
+ return *getFpValPtr();
+ }
+
+ APFloat &getFpVal() {
+ assert(IsFp && BufHasFpVal && "Incorret state");
+ return *getFpValPtr();
+ }
+
+ bool isInt() const { return !IsFp; }
+
+    // If the coefficient is represented by an integer, promote it to a
+    // floating-point value.
+ void convertToFpType(const fltSemantics &Sem);
+
+ // Construct an APFloat from a signed integer.
+ // TODO: We should get rid of this function when APFloat can be constructed
+    // from a *SIGNED* integer.
+ APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
+
+ private:
+ bool IsFp;
+
+ // True iff FpValBuf contains an instance of APFloat.
+ bool BufHasFpVal;
+
+    // The integer coefficient of an individual addend is either 1 or -1,
+    // and we try to simplify at most 4 addends drawn from at most two
+    // neighboring instructions. So the range of <IntVal> falls in [-4, 4].
+    // APInt is overkill for this purpose.
+ short IntVal;
+
+ AlignedCharArrayUnion<APFloat> FpValBuf;
+ };
+
+  /// FAddend is used to represent a floating-point addend. An addend is
+  /// represented as <C, V>, where V is a symbolic value, and C is a
+  /// constant coefficient. A constant addend is represented as <C, 0>.
+ ///
+ class FAddend {
+ public:
+ FAddend() : Val(nullptr) {}
+
+ Value *getSymVal() const { return Val; }
+ const FAddendCoef &getCoef() const { return Coeff; }
+
+ bool isConstant() const { return Val == nullptr; }
+ bool isZero() const { return Coeff.isZero(); }
+
+ void set(short Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const APFloat &Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const ConstantFP *Coefficient, Value *V) {
+ Coeff.set(Coefficient->getValueAPF());
+ Val = V;
+ }
+
+ void negate() { Coeff.negate(); }
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+    /// Similar to FAddend::drillValueDownOneStep() except that the value
+    /// being split is the addend itself.
+ unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ void operator+=(const FAddend &T) {
+ assert((Val == T.Val) && "Symbolic-values disagree");
+ Coeff += T.Coeff;
+ }
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ Value *Val;
+ FAddendCoef Coeff;
+ };
+
+  /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
+  /// with at most two of its neighboring instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy &B) : Builder(B), Instr(nullptr) {}
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ typedef SmallVector<const FAddend*, 4> AddendVect;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ Value *performFactorization(Instruction *I);
+
+    /// Convert the given addend to a Value.
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFDiv(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
+
+ InstCombiner::BuilderTy &Builder;
+ Instruction *Instr;
+
+    // Debugging stuff is clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+ };
+
+} // anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+FAddendCoef::~FAddendCoef() {
+ if (BufHasFpVal)
+ getFpValPtr()->~APFloat();
+}
+
+void FAddendCoef::set(const APFloat& C) {
+ APFloat *P = getFpValPtr();
+
+ if (isInt()) {
+    // As the buffer is a meaningless byte stream, we cannot call
+    // APFloat::operator=().
+ new(P) APFloat(C);
+ } else
+ *P = C;
+
+ IsFp = BufHasFpVal = true;
+}
+
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+ if (!isInt())
+ return;
+
+ APFloat *P = getFpValPtr();
+ if (IntVal > 0)
+ new(P) APFloat(Sem, IntVal);
+ else {
+ new(P) APFloat(Sem, 0 - IntVal);
+ P->changeSign();
+ }
+ IsFp = BufHasFpVal = true;
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+ if (Val >= 0)
+ return APFloat(Sem, Val);
+
+ APFloat T(Sem, 0 - Val);
+ T.changeSign();
+
+ return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
+ if (That.isInt())
+ set(That.IntVal);
+ else
+ set(That.getFpVal());
+}
+
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt() == That.isInt()) {
+ if (isInt())
+ IntVal += That.IntVal;
+ else
+ getFpVal().add(That.getFpVal(), RndMode);
+ return;
+ }
+
+ if (isInt()) {
+ const APFloat &T = That.getFpVal();
+ convertToFpType(T.getSemantics());
+ getFpVal().add(T, RndMode);
+ return;
+ }
+
+ APFloat &T = getFpVal();
+ T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
+}
+
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt() && That.isInt()) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!insaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
+
+ if (isInt())
+ convertToFpType(Semantic);
+ APFloat &F0 = getFpVal();
+
+ if (That.isInt())
+ F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+ APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
+}
+
+void FAddendCoef::negate() {
+ if (isInt())
+ IntVal = 0 - IntVal;
+ else
+ getFpVal().changeSign();
+}
+
+Value *FAddendCoef::getValue(Type *Ty) const {
+ return isInt() ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), getFpVal());
+}
+
+// The definition of <Val> Addends
+// =========================================
+// A + B <1, A>, <1,B>
+//  A - B                     <1, A>, <-1,B>
+// 0 - B <-1, B>
+// C * A, <C, A>
+// A + C <1, A> <C, NULL>
+// 0 +/- 0 <0, NULL> (corner case)
+//
+// Legend: A and B are not constant, C is constant
+//
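+// For example (an illustrative case; the names are made up): for
+//   %t = fsub float %x, 2.5
+// this function returns 2 with Addend0 = <1, %x> and Addend1 = <-2.5, NULL>.
+//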
+unsigned FAddend::drillValueDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = nullptr;
+ if (!Val || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = nullptr;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = nullptr;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, nullptr);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, nullptr);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+// Try to break *this* addend into two addends. For example, suppose this
+// addend is <2.3, V> and V = X + Y; by calling this function we obtain two
+// addends, namely <2.3, X> and <2.3, Y>.
+//
+unsigned FAddend::drillAddendDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
+// Try to perform the following optimization on the input instruction I. Return
+// the simplified expression if it was successful; otherwise, return nullptr.
+//
+// Instruction "I" is Simplified into
+// -------------------------------------------------------
+// (x * y) +/- (x * z) x * (y +/- z)
+// (y / x) +/- (z / x) (y +/- z) / x
+//
+Value *FAddCombine::performFactorization(Instruction *I) {
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
+ Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+
+ if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
+ return nullptr;
+
+ bool isMpy = false;
+ if (I0->getOpcode() == Instruction::FMul)
+ isMpy = true;
+ else if (I0->getOpcode() != Instruction::FDiv)
+ return nullptr;
+
+ Value *Opnd0_0 = I0->getOperand(0);
+ Value *Opnd0_1 = I0->getOperand(1);
+ Value *Opnd1_0 = I1->getOperand(0);
+ Value *Opnd1_1 = I1->getOperand(1);
+
+ // Input Instr I Factor AddSub0 AddSub1
+ // ----------------------------------------------
+ // (x*y) +/- (x*z) x y z
+ // (y/x) +/- (z/x) x y z
+ //
+ Value *Factor = nullptr;
+ Value *AddSub0 = nullptr, *AddSub1 = nullptr;
+
+ if (isMpy) {
+ if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1)
+ Factor = Opnd0_0;
+ else if (Opnd0_1 == Opnd1_0 || Opnd0_1 == Opnd1_1)
+ Factor = Opnd0_1;
+
+ if (Factor) {
+ AddSub0 = (Factor == Opnd0_0) ? Opnd0_1 : Opnd0_0;
+ AddSub1 = (Factor == Opnd1_0) ? Opnd1_1 : Opnd1_0;
+ }
+ } else if (Opnd0_1 == Opnd1_1) {
+ Factor = Opnd0_1;
+ AddSub0 = Opnd0_0;
+ AddSub1 = Opnd1_0;
+ }
+
+ if (!Factor)
+ return nullptr;
+
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ if (I0) Flags &= I->getFastMathFlags();
+ if (I1) Flags &= I->getFastMathFlags();
+
+ // Create expression "NewAddSub = AddSub0 +/- AddsSub1"
+ Value *NewAddSub = (I->getOpcode() == Instruction::FAdd) ?
+ createFAdd(AddSub0, AddSub1) :
+ createFSub(AddSub0, AddSub1);
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) {
+ const APFloat &F = CFP->getValueAPF();
+ if (!F.isNormal())
+ return nullptr;
+ } else if (Instruction *II = dyn_cast<Instruction>(NewAddSub))
+ II->setFastMathFlags(Flags);
+
+ if (isMpy) {
+ Value *RI = createFMul(Factor, NewAddSub);
+ if (Instruction *II = dyn_cast<Instruction>(RI))
+ II->setFastMathFlags(Flags);
+ return RI;
+ }
+
+ Value *RI = createFDiv(NewAddSub, Factor);
+ if (Instruction *II = dyn_cast<Instruction>(RI))
+ II->setFastMathFlags(Flags);
+ return RI;
+}
+
+Value *FAddCombine::simplify(Instruction *I) {
+ assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+  // Currently we are not able to handle vector types.
+ if (I->getType()->isVectorTy())
+ return nullptr;
+
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+    // The input instruction is: "I = 0.0 +/- V". If "V" could have been
+    // split into two addends, say "V = X - Y", the instruction would have
+    // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef &CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : nullptr;
+ }
+
+  // Step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+  // Step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+  // Step 6: Try factorization as the last resort.
+ return performFactorization(I);
+}
+
+Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
+ unsigned AddendNum = Addends.size();
+ assert(AddendNum <= 4 && "Too many addends");
+
+ // For saving intermediate results;
+ unsigned NextTmpIdx = 0;
+ FAddend TmpResult[3];
+
+  // Points to the constant addend of the resulting simplified expression.
+  // If the resulting expression has a constant addend, it is desirable for
+  // that constant to reside at the top of the resulting expression tree.
+  // Placing the constant close to the super-expression(s) will potentially
+  // reveal some optimization opportunities in the super-expression(s).
+ //
+ const FAddend *ConstAdd = nullptr;
+
+  // Simplified addends are placed in <SimpVect>.
+ AddendVect SimpVect;
+
+ // The outer loop works on one symbolic-value at a time. Suppose the input
+ // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
+ // The symbolic-values will be processed in this order: x, y, z.
+ //
+ for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
+
+ const FAddend *ThisAddend = Addends[SymIdx];
+ if (!ThisAddend) {
+ // This addend was processed before.
+ continue;
+ }
+
+ Value *Val = ThisAddend->getSymVal();
+ unsigned StartIdx = SimpVect.size();
+ SimpVect.push_back(ThisAddend);
+
+    // The inner loop collects addends sharing the same symbolic value; these
+    // addends will later be folded into a single addend. Following the above
+    // example, if the symbolic value "y" is being processed, the inner loop
+    // will collect the two addends "<b1,y>" and "<b2,y>". These two addends
+    // will later be folded into "<b1+b2, y>".
+ //
+ for (unsigned SameSymIdx = SymIdx + 1;
+ SameSymIdx < AddendNum; SameSymIdx++) {
+ const FAddend *T = Addends[SameSymIdx];
+ if (T && T->getSymVal() == Val) {
+ // Set null such that next iteration of the outer loop will not process
+ // this addend again.
+ Addends[SameSymIdx] = nullptr;
+ SimpVect.push_back(T);
+ }
+ }
+
+    // If multiple addends share the same symbolic value, fold them together.
+ if (StartIdx + 1 != SimpVect.size()) {
+ FAddend &R = TmpResult[NextTmpIdx ++];
+ R = *SimpVect[StartIdx];
+ for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
+ R += *SimpVect[Idx];
+
+ // Pop all addends being folded and push the resulting folded addend.
+ SimpVect.resize(StartIdx);
+ if (Val) {
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
+ }
+ } else {
+        // Don't push the constant addend at this time. It will be the last
+        // element of <SimpVect>.
+ ConstAdd = &R;
+ }
+ }
+ }
+
+ assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
+ "out-of-bound access");
+
+ if (ConstAdd)
+ SimpVect.push_back(ConstAdd);
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0.
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
+
+Value *FAddCombine::createNaryFAdd
+ (const AddendVect &Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instructions needed exceeds the quota.
+ //
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return nullptr;
+
+ initCreateInstNum();
+
+  // Step 2: Emit the N-ary addition.
+ // Note that at most three instructions are involved in Fadd-InstCombine: the
+ // addition in question, and at most two neighboring instructions.
+ // The resulting optimized addition should have at least one less instruction
+ // than the original addition expression tree. This implies that the resulting
+ // N-ary addition has at most two instructions, and we don't need to worry
+ // about tree-height when constructing the N-ary addition.
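+  //
+  // For instance (an illustrative case), given the addends <1, a>, <-1, b>
+  // and a constant <c, NULL>, the loop below emits "t = fsub a, b" followed
+  // by "fadd t, c".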
+
+ Value *LastVal = nullptr;
+ bool LastValNeedNeg = false;
+
+  // Iterate the addends, creating fadd/fsub using two adjacent addends.
+ for (const FAddend *Opnd : Opnds) {
+ bool NeedNeg;
+ Value *V = createAddendVal(*Opnd, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+ #ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+ #endif
+
+ return LastVal;
+}
+
+Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFSub(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFNeg(Value *V) {
+ Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType()));
+ Value *NewV = createFSub(Zero, V);
+ if (Instruction *I = dyn_cast<Instruction>(NewV))
+ createInstPostProc(I, true); // fneg's don't receive instruction numbers.
+ return NewV;
+}
+
+Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFMul(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFDiv(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+  // Keep track of the number of instructions created.
+ if (!NoNumber)
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instructions needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form of "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (const FAddend *Opnd : Opnds) {
+ if (Opnd->isConstant())
+ continue;
+
+ // The constant check above is really for a few special constant
+ // coefficients.
+ if (isa<UndefValue>(Opnd->getSymVal()))
+ continue;
+
+ const FAddendCoef &CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ if (NegOpndNum == OpndNum)
+ InstrNeeded++;
+ return InstrNeeded;
+}
+
+// Input Addend Value NeedNeg(output)
+// ================================================================
+// Constant C C false
+// <+/-1, V> V coefficient is -1
+// <2/-2, V> "fadd V, V" coefficient is -2
+// <C, V> "fmul V, C" false
+//
+// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
+Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
+ const FAddendCoef &Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
+/// \brief Return true if we can prove that:
+/// (sub LHS, RHS) === (sub nsw LHS, RHS)
+/// This basically requires proving that the subtraction in the original type
+/// would not overflow to change the sign bit or have a carry out.
+/// TODO: Handle this for Vectors.
+bool InstCombiner::willNotOverflowSignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
+ // If LHS and RHS each have at least two sign bits, the subtraction
+ // cannot overflow.
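+  // (For example, with i8 operands, having at least two sign bits means each
+  // value lies in [-64, 63], so the difference lies in [-127, 127] and always
+  // fits.)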
+ if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
+ ComputeNumSignBits(RHS, 0, &CxtI) > 1)
+ return true;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI);
+
+ KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI);
+
+ // Subtraction of two 2's complement numbers having identical signs will
+ // never overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
+ return true;
+
+ // TODO: implement logic similar to checkRippleForAdd
+ return false;
+}
+
+/// \brief Return true if we can prove that:
+/// (sub LHS, RHS) === (sub nuw LHS, RHS)
+bool InstCombiner::willNotOverflowUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
+ // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
+ return true;
+
+ return false;
+}
+
+// Checks if any operand is negative and we can convert add to sub.
+// This function checks for the following negative patterns
+// ADD(XOR(OR(Z, NOT(C)), C), 1) == NEG(AND(Z, C))
+// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
+// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
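+// As a sanity check of the first identity with 4-bit values: Z = 0b0110 and
+// C = 0b0011 give OR(Z, NOT(C)) = 0b1110, XOR with C = 0b1101, plus 1 =
+// 0b1110, which equals NEG(AND(Z, C)) = NEG(0b0010) = 0b1110.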
+static Value *checkForNegativeOperand(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+  // This function creates 2 instructions to replace ADD, so we need at least
+  // one of LHS or RHS to have one use to ensure a benefit from the transform.
+ if (!LHS->hasOneUse() && !RHS->hasOneUse())
+ return nullptr;
+
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+ const APInt *C1 = nullptr, *C2 = nullptr;
+
+  // if ONE is on the other side, swap
+ if (match(RHS, m_Add(m_Value(X), m_One())))
+ std::swap(LHS, RHS);
+
+ if (match(LHS, m_Add(m_Value(X), m_One()))) {
+    // if XOR is on the other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(X, RHS);
+
+ if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
+ // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
+ if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
+ Value *NewAnd = Builder.CreateAnd(Z, *C1);
+ return Builder.CreateSub(RHS, NewAnd, "sub");
+ } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
+ // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
+ Value *NewOr = Builder.CreateOr(Z, ~(*C1));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ }
+ }
+
+ // Restore LHS and RHS
+ LHS = I.getOperand(0);
+ RHS = I.getOperand(1);
+
+  // if XOR is on the other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(LHS, RHS);
+
+  // C1 is odd, so C2 == C1 - 1 is even
+ // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
+ // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
+ if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ if (C1->countTrailingZeros() == 0)
+ if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
+ Value *NewOr = Builder.CreateOr(Z, ~(*C2));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ return nullptr;
+}
+
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+ // FIXME: This should check hasOneUse to not increase the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ if (C->isOneValue() && Op0->hasOneUse()) {
+ // add (sext i1 X), 1 --> zext (not X)
+ // TODO: The smallest IR representation is (select X, 0, 1), and that would
+ // not require the one-use check. But we need to remove a transform in
+ // visitSelect and make sure that IR value tracking for select is equal or
+ // better than for these ops.
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
+ return new ZExtInst(Builder.CreateNot(X), Ty);
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V =
+ SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // (A*B)+(A*C) -> A*(B+C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldAddWithConstant(I, Builder))
+ return X;
+
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ // zext(bool) + C -> bool ? C + 1 : C
+ if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
+ if (ZI->getSrcTy()->isIntegerTy(1))
+ return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI);
+
+ Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr;
+ if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
+ uint32_t TySizeBits = I.getType()->getScalarSizeInBits();
+ const APInt &RHSVal = CI->getValue();
+ unsigned ExtendAmt = 0;
+ // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
+ // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
+ if (XorRHS->getValue() == -RHSVal) {
+ if (RHSVal.isPowerOf2())
+ ExtendAmt = TySizeBits - RHSVal.logBase2() - 1;
+ else if (XorRHS->getValue().isPowerOf2())
+ ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1;
+ }
+
+ if (ExtendAmt) {
+ APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
+ if (!MaskedValueIsZero(XorLHS, Mask, 0, &I))
+ ExtendAmt = 0;
+ }
+
+ if (ExtendAmt) {
+ Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt);
+ Value *NewShl = Builder.CreateShl(XorLHS, ShAmt, "sext");
+ return BinaryOperator::CreateAShr(NewShl, ShAmt);
+ }
+
+ // If this is a xor that was canonicalized from a sub, turn it back into
+ // a sub and fuse this add with it.
+ if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) {
+ KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I);
+ if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue())
+ return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
+ XorLHS);
+ }
+ // (X + signmask) + C could have gotten canonicalized to (X^signmask) + C,
+ // transform them into (X + (signmask ^ C))
+ if (XorRHS->getValue().isSignMask())
+ return BinaryOperator::CreateAdd(XorLHS,
+ ConstantExpr::getXor(XorRHS, CI));
+ }
+ }
+
+ if (isa<Constant>(RHS))
+ if (Instruction *NV = foldOpWithConstantIntoOperand(I))
+ return NV;
+
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(LHS, RHS);
+
+ // X + X --> X << 1
+ if (LHS == RHS) {
+ BinaryOperator *New =
+ BinaryOperator::CreateShl(LHS, ConstantInt::get(I.getType(), 1));
+ New->setHasNoSignedWrap(I.hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ return New;
+ }
+
+ // -A + B --> B - A
+ // -A + -B --> -(A + B)
+ if (Value *LHSV = dyn_castNegVal(LHS)) {
+ if (!isa<Constant>(RHS))
+ if (Value *RHSV = dyn_castNegVal(RHS)) {
+ Value *NewAdd = Builder.CreateAdd(LHSV, RHSV, "sum");
+ return BinaryOperator::CreateNeg(NewAdd);
+ }
+
+ return BinaryOperator::CreateSub(RHS, LHSV);
+ }
+
+ // A + -B --> A - B
+ if (!isa<Constant>(RHS))
+ if (Value *V = dyn_castNegVal(RHS))
+ return BinaryOperator::CreateSub(LHS, V);
+
+ if (Value *V = checkForNegativeOperand(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ // A+B --> A|B iff A and B have no bits set in common.
+ if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
+ return BinaryOperator::CreateOr(LHS, RHS);
+
+ if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+ Value *X;
+ if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X
+ return BinaryOperator::CreateSub(SubOne(CRHS), X);
+ }
+
+ // FIXME: We already did a check for ConstantInt RHS above this.
+ // FIXME: Is this pattern covered by another fold? No regression tests fail on
+ // removal.
+ if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
+ // (X & FF00) + xx00 -> (X+xx00) & FF00
+ Value *X;
+ ConstantInt *C2;
+ if (LHS->hasOneUse() &&
+ match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) &&
+ CRHS->getValue() == (CRHS->getValue() & C2->getValue())) {
+ // See if all bits from the first bit set in the Add RHS up are included
+ // in the mask. First, get the rightmost bit.
+ const APInt &AddRHSV = CRHS->getValue();
+
+ // Form a mask of all bits from the lowest bit added through the top.
+ APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
+
+ // See if the and mask includes all of these bits.
+ APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
+
+ if (AddRHSHighBits == AddRHSHighBitsAnd) {
+ // Okay, the xform is safe. Insert the new add pronto.
+ Value *NewAdd = Builder.CreateAdd(X, CRHS, LHS->getName());
+ return BinaryOperator::CreateAnd(NewAdd, C2);
+ }
+ }
+ }
+
+ // add (select X 0 (sub n A)) A --> select X A n
+ {
+ SelectInst *SI = dyn_cast<SelectInst>(LHS);
+ Value *A = RHS;
+ if (!SI) {
+ SI = dyn_cast<SelectInst>(RHS);
+ A = LHS;
+ }
+ if (SI && SI->hasOneUse()) {
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ Value *N;
+
+ // Can we fold the add into the argument of the select?
+ // We check both true and false select arguments for a matching subtract.
+ if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the true select value.
+ return SelectInst::Create(SI->getCondition(), N, A);
+
+ if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the false select value.
+ return SelectInst::Create(SI->getCondition(), A, N);
+ }
+ }
+
+ // Check for (add (sext x), y), see if we can merge this into an
+ // integer add followed by a sext.
+ if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
+ // (add (sext x), cst) --> (sext (add x, cst'))
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+ if (LHSConv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+ if (ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+ willNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+ // Insert the new, smaller add.
+ Value *NewAdd =
+ Builder.CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv");
+ return new SExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // (add (sext x), (sext y)) --> (sext (add int x, y))
+ if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of sexts), and if the
+ // integer add will not overflow.
+ if (LHSConv->getOperand(0)->getType() ==
+ RHSConv->getOperand(0)->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowSignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), "addconv");
+ return new SExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // Check for (add (zext x), y), see if we can merge this into an
+ // integer add followed by a zext.
+ if (auto *LHSConv = dyn_cast<ZExtInst>(LHS)) {
+ // (add (zext x), cst) --> (zext (add x, cst'))
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+ if (LHSConv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+ if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
+ // Insert the new, smaller add.
+ Value *NewAdd =
+ Builder.CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
+ return new ZExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // (add (zext x), (zext y)) --> (zext (add int x, y))
+ if (auto *RHSConv = dyn_cast<ZExtInst>(RHS)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of zexts), and if the
+ // integer add will not overflow.
+ if (LHSConv->getOperand(0)->getType() ==
+ RHSConv->getOperand(0)->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNUWAdd(
+ LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
+ return new ZExtInst(NewAdd, I.getType());
+ }
+ }
+ }
+
+ // (add (xor A, B) (and A, B)) --> (or A, B)
+ {
+ Value *A = nullptr, *B = nullptr;
+ if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+
+ if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+ }
+
+ // (add (or A, B) (and A, B)) --> (add A, B)
+ {
+ Value *A = nullptr, *B = nullptr;
+ if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
+ auto *New = BinaryOperator::CreateAdd(A, B);
+ New->setHasNoSignedWrap(I.hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ return New;
+ }
+
+ if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
+ auto *New = BinaryOperator::CreateAdd(A, B);
+ New->setHasNoSignedWrap(I.hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ return New;
+ }
+ }
+
+ // TODO(jingyue): Consider willNotOverflowSignedAdd and
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
+ // computeKnownBits.
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
+Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (isa<Constant>(RHS))
+ if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I))
+ return FoldedFAdd;
+
+ // -A + B --> B - A
+ // -A + -B --> -(A + B)
+ if (Value *LHSV = dyn_castFNegVal(LHS)) {
+ Instruction *RI = BinaryOperator::CreateFSub(RHS, LHSV);
+ RI->copyFastMathFlags(&I);
+ return RI;
+ }
+
+ // A + -B --> A - B
+ if (!isa<Constant>(RHS))
+ if (Value *V = dyn_castFNegVal(RHS)) {
+ Instruction *RI = BinaryOperator::CreateFSub(LHS, V);
+ RI->copyFastMathFlags(&I);
+ return RI;
+ }
+
+ // Check for (fadd double (sitofp x), y), see if we can merge this into an
+ // integer add followed by a promotion.
+ if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ Value *LHSIntVal = LHSConv->getOperand(0);
+ Type *FPType = LHSConv->getType();
+
+    // TODO: This check is overly conservative. In many cases known bits
+    // analysis can tell us that the result of the addition has fewer
+    // significant bits than the integer type can hold.
+ auto IsValidPromotion = [](Type *FTy, Type *ITy) {
+ Type *FScalarTy = FTy->getScalarType();
+ Type *IScalarTy = ITy->getScalarType();
+
+ // Do we have enough bits in the significand to represent the result of
+ // the integer addition?
+ unsigned MaxRepresentableBits =
+ APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
+ return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
+ };
+
+ // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+ // ... if the constant fits in the integer value. This is useful for things
+ // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+ // requires a constant pool load, and generally allows the add to be better
+ // instcombined.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ Constant *CI =
+ ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+ willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+
+ // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+ if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+ Value *RHSIntVal = RHSConv->getOperand(0);
+ // It's enough to check LHS types only because we require int types to
+ // be the same for this transform.
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of int->fp conversions),
+ // and if the integer add will not overflow.
+ if (LHSIntVal->getType() == RHSIntVal->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+ }
+ }
+
+ // select C, 0, B + select C, A, 0 -> select C, A, B
+ {
+ Value *A1, *B1, *C1, *A2, *B2, *C2;
+ if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) &&
+ match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) {
+ if (C1 == C2) {
+ Constant *Z1=nullptr, *Z2=nullptr;
+ Value *A, *B, *C=C1;
+ if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) {
+ Z1 = dyn_cast<Constant>(A1); A = A2;
+ Z2 = dyn_cast<Constant>(B2); B = B1;
+ } else if (match(B1, m_AnyZero()) && match(A2, m_AnyZero())) {
+ Z1 = dyn_cast<Constant>(B1); B = B2;
+ Z2 = dyn_cast<Constant>(A2); A = A1;
+ }
+
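+        // Adding -0.0 leaves any value unchanged (including +0.0), but adding
+        // +0.0 turns -0.0 into +0.0, so without nsz both zeros must be -0.0.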
+ if (Z1 && Z2 &&
+ (I.hasNoSignedZeros() ||
+ (Z1->isNegativeZeroValue() && Z2->isNegativeZeroValue()))) {
+ return SelectInst::Create(C, A, B);
+ }
+ }
+ }
+ }
+
+ if (I.hasUnsafeAlgebra()) {
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
+/// Optimize differences of pointers into the same array into a size. Consider:
+/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
+/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
+///
+Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
+ Type *Ty) {
+ // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
+ // this.
+ bool Swapped = false;
+ GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
+
+ // For now we require one side to be the base pointer "A" or a constant
+ // GEP derived from it.
+ if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
+ // (gep X, ...) - X
+ if (LHSGEP->getOperand(0) == RHS) {
+ GEP1 = LHSGEP;
+ Swapped = false;
+ } else if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
+ // (gep X, ...) - (gep X, ...)
+ if (LHSGEP->getOperand(0)->stripPointerCasts() ==
+ RHSGEP->getOperand(0)->stripPointerCasts()) {
+ GEP2 = RHSGEP;
+ GEP1 = LHSGEP;
+ Swapped = false;
+ }
+ }
+ }
+
+ if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
+ // X - (gep X, ...)
+ if (RHSGEP->getOperand(0) == LHS) {
+ GEP1 = RHSGEP;
+ Swapped = true;
+ } else if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
+ // (gep X, ...) - (gep X, ...)
+ if (RHSGEP->getOperand(0)->stripPointerCasts() ==
+ LHSGEP->getOperand(0)->stripPointerCasts()) {
+ GEP2 = LHSGEP;
+ GEP1 = RHSGEP;
+ Swapped = true;
+ }
+ }
+ }
+
+ // Avoid duplicating the arithmetic if GEP2 has non-constant indices and
+ // multiple users.
+ if (!GEP1 ||
+ (GEP2 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse()))
+ return nullptr;
+
+  // Emit the offset of the GEP as an intptr_t.
+ Value *Result = EmitGEPOffset(GEP1);
+
+ // If we had a constant expression GEP on the other side offsetting the
+ // pointer, subtract it from the offset we have.
+ if (GEP2) {
+ Value *Offset = EmitGEPOffset(GEP2);
+ Result = Builder.CreateSub(Result, Offset);
+ }
+
+ // If we have p - gep(p, ...) then we have to negate the result.
+ if (Swapped)
+ Result = Builder.CreateNeg(Result, "diff.neg");
+
+ return Builder.CreateIntCast(Result, Ty, true);
+}
+
+Instruction *InstCombiner::visitSub(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V =
+ SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // (A*B)-(A*C) -> A*(B-C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // If this is a 'B = x-(-A)', change to B = x+A.
+ if (Value *V = dyn_castNegVal(Op1)) {
+ BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
+
+ if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
+ assert(BO->getOpcode() == Instruction::Sub &&
+ "Expected a subtraction operator!");
+ if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ } else {
+ if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ }
+
+ return Res;
+ }
+
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(Op0, Op1);
+
+ // Replace (-1 - A) with (~A).
+ if (match(Op0, m_AllOnes()))
+ return BinaryOperator::CreateNot(Op1);
+
+ if (Constant *C = dyn_cast<Constant>(Op0)) {
+ // C - ~X == X + (1+C)
+ Value *X = nullptr;
+ if (match(Op1, m_Not(m_Value(X))))
+ return BinaryOperator::CreateAdd(X, AddOne(C));
+
+ // Try to fold constant sub into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ // Try to fold constant sub into PHI values.
+ if (PHINode *PN = dyn_cast<PHINode>(Op1))
+ if (Instruction *R = foldOpIntoPhi(I, PN))
+ return R;
+
+ // C-(X+C2) --> (C-C2)-X
+ Constant *C2;
+ if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
+
+ // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
+ if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
+ if (X->getType()->isIntOrIntVectorTy(1))
+ return CastInst::CreateSExtOrBitCast(X, Op1->getType());
+
+ // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
+ if (C->isNullValue() && match(Op1, m_SExt(m_Value(X))))
+ if (X->getType()->isIntOrIntVectorTy(1))
+ return CastInst::CreateZExtOrBitCast(X, Op1->getType());
+ }
+
+ const APInt *Op0C;
+ if (match(Op0, m_APInt(Op0C))) {
+ unsigned BitWidth = I.getType()->getScalarSizeInBits();
+
+ // -(X >>u 31) -> (X >>s 31)
+ // -(X >>s 31) -> (X >>u 31)
+ if (Op0C->isNullValue()) {
+ Value *X;
+ const APInt *ShAmt;
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
+ *ShAmt == BitWidth - 1) {
+ Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
+ return BinaryOperator::CreateAShr(X, ShAmtOp);
+ }
+ if (match(Op1, m_AShr(m_Value(X), m_APInt(ShAmt))) &&
+ *ShAmt == BitWidth - 1) {
+ Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1);
+ return BinaryOperator::CreateLShr(X, ShAmtOp);
+ }
+ }
+
+ // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
+ // zero.
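+    // For example, 15 - X --> X ^ 15 when the bits of X above the low four
+    // are known to be zero.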
+ if (Op0C->isMask()) {
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
+ return BinaryOperator::CreateXor(Op1, Op0);
+ }
+ }
+
+ {
+ Value *Y;
+ // X-(X+Y) == -Y X-(Y+X) == -Y
+ if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+
+ // (X-Y)-X == -Y
+ if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+ }
+
+ // (sub (or A, B) (xor A, B)) --> (and A, B)
+ {
+ Value *A, *B;
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ {
+ Value *Y;
+ // ((X | Y) - X) --> (~X & Y)
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(
+ Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
+ }
+
+ if (Op1->hasOneUse()) {
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+ Constant *C = nullptr;
+
+ // (X - (Y - Z)) --> (X + (Z - Y)).
+ if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
+ return BinaryOperator::CreateAdd(Op0,
+ Builder.CreateSub(Z, Y, Op1->getName()));
+
+    // (X - (X & Y)) --> (X & ~Y)
+ if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
+ return BinaryOperator::CreateAnd(Op0,
+ Builder.CreateNot(Y, Y->getName() + ".not"));
+
+ // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow.
+ if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) &&
+ C->isNotMinSignedValue() && !C->isOneValue())
+ return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C));
+
+ // 0 - (X << Y) -> (-X << Y) when X is freely negatable.
+ if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero()))
+ if (Value *XNeg = dyn_castNegVal(X))
+ return BinaryOperator::CreateShl(XNeg, Y);
+
+ // Subtracting -1/0 is the same as adding 1/0:
+ // sub [nsw] Op0, sext(bool Y) -> add [nsw] Op0, zext(bool Y)
+ // 'nuw' is dropped in favor of the canonical form.
+ if (match(Op1, m_SExt(m_Value(Y))) &&
+ Y->getType()->getScalarSizeInBits() == 1) {
+ Value *Zext = Builder.CreateZExt(Y, I.getType());
+ BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext);
+ Add->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return Add;
+ }
+
+ // X - A*-B -> X + A*B
+ // X - -A*B -> X + A*B
+ Value *A, *B;
+ Constant *CI;
+ if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
+ return BinaryOperator::CreateAdd(Op0, Builder.CreateMul(A, B));
+
+ // X - A*CI -> X + A*-CI
+ // No need to handle commuted multiply because multiply handling will
+    // ensure the constant is moved to the right-hand side.
+ if (match(Op1, m_Mul(m_Value(A), m_Constant(CI)))) {
+ Value *NewMul = Builder.CreateMul(A, ConstantExpr::getNeg(CI));
+ return BinaryOperator::CreateAdd(Op0, NewMul);
+ }
+ }
+
+  // Optimize differences of pointers into the same array into a size. Consider:
+ // &A[10] - &A[0]: we should compile this to "10".
+ Value *LHSOp, *RHSOp;
+ if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+ return replaceInstUsesWith(I, Res);
+
+ // trunc(p)-trunc(q) -> trunc(p-q)
+ if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+ return replaceInstUsesWith(I, Res);
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
+Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // fsub nsz 0, X ==> fsub nsz -0.0, X
+ if (I.getFastMathFlags().noSignedZeros() && match(Op0, m_Zero())) {
+ // Subtraction from -0.0 is the canonical form of fneg.
+ Instruction *NewI = BinaryOperator::CreateFNeg(Op1);
+ NewI->copyFastMathFlags(&I);
+ return NewI;
+ }
+
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *NV = FoldOpIntoSelect(I, SI))
+ return NV;
+
+ // If this is a 'B = x-(-A)', change to B = x+A, potentially looking
+ // through FP extensions/truncations along the way.
+ if (Value *V = dyn_castFNegVal(Op1)) {
+ Instruction *NewI = BinaryOperator::CreateFAdd(Op0, V);
+ NewI->copyFastMathFlags(&I);
+ return NewI;
+ }
+ if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) {
+ if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) {
+ Value *NewTrunc = Builder.CreateFPTrunc(V, I.getType());
+ Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc);
+ NewI->copyFastMathFlags(&I);
+ return NewI;
+ }
+ } else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) {
+ if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) {
+ Value *NewExt = Builder.CreateFPExt(V, I.getType());
+ Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt);
+ NewI->copyFastMathFlags(&I);
+ return NewI;
+ }
+ }
+
+ if (I.hasUnsafeAlgebra()) {
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+ }
+
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
new file mode 100644
index 000000000000..773c86e23707
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -0,0 +1,2645 @@
+//===- InstCombineAndOrXor.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitAnd, visitOr, and visitXor functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
+/// a four bit mask.
+static unsigned getFCmpCode(FCmpInst::Predicate CC) {
+ assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ // Take advantage of the bit pattern of FCmpInst::Predicate here.
+ // U L G E
+ static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
+ static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
+ static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
+ static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
+ static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
+ static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
+ static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
+ static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
+ static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
+ static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
+ static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
+ static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
+ static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
+ static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
+ static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
+ static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
+ return CC;
+}
+
+/// This is the complement of getICmpCode, which turns an opcode and two
+/// operands into either a constant true or false, or a brand new ICmp
+/// instruction. The sign is passed in to determine which kind of predicate to
+/// use in the new icmp instruction.
+static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate NewPred;
+ if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred))
+ return NewConstant;
+ return Builder.CreateICmp(NewPred, LHS, RHS);
+}
+
+/// This is the complement of getFCmpCode, which turns an opcode and two
+/// operands into either a FCmp instruction, or a true/false constant.
+static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ const auto Pred = static_cast<FCmpInst::Predicate>(Code);
+ assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ if (Pred == FCmpInst::FCMP_FALSE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+ if (Pred == FCmpInst::FCMP_TRUE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+ return Builder.CreateFCmp(Pred, LHS, RHS);
+}
+
+/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
+/// \param I Binary operator to transform.
+/// \return Pointer to node that must replace the original binary operator, or
+/// null pointer if no transformation was made.
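+/// For example, with i32 operands, (and (bswap X), 0xFF) can be rewritten as
+/// (bswap (and X, 0xFF000000)), since bswap(0xFF) == 0xFF000000.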
+static Value *SimplifyBSwap(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
+
+ Value *OldLHS = I.getOperand(0);
+ Value *OldRHS = I.getOperand(1);
+
+ Value *NewLHS;
+ if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
+ return nullptr;
+
+ Value *NewRHS;
+ const APInt *C;
+
+ if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
+ // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+ if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
+ return nullptr;
+ // NewRHS initialized by the matcher.
+ } else if (match(OldRHS, m_APInt(C))) {
+ // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+ if (!OldLHS->hasOneUse())
+ return nullptr;
+ NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
+ } else
+ return nullptr;
+
+ Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
+ Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
+ I.getType());
+ return Builder.CreateCall(F, BinOp);
+}
+
+/// This handles expressions of the form ((val OP C1) & C2), where the Op
+/// parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.
+Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
+ ConstantInt *OpRHS,
+ ConstantInt *AndRHS,
+ BinaryOperator &TheAnd) {
+ Value *X = Op->getOperand(0);
+ Constant *Together = nullptr;
+ if (!Op->isShift())
+ Together = ConstantExpr::getAnd(AndRHS, OpRHS);
+
+ switch (Op->getOpcode()) {
+ default: break;
+ case Instruction::Xor:
+ if (Op->hasOneUse()) {
+ // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+ Value *And = Builder.CreateAnd(X, AndRHS);
+ And->takeName(Op);
+ return BinaryOperator::CreateXor(And, Together);
+ }
+ break;
+ case Instruction::Or:
+ if (Op->hasOneUse()){
+ ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
+ if (TogetherCI && !TogetherCI->isZero()){
+ // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
+ // NOTE: This reduces the number of bits set in the & mask, which
+ // can expose opportunities for store narrowing.
+ Together = ConstantExpr::getXor(AndRHS, Together);
+ Value *And = Builder.CreateAnd(X, Together);
+ And->takeName(Op);
+ return BinaryOperator::CreateOr(And, OpRHS);
+ }
+ }
+
+ break;
+ case Instruction::Add:
+ if (Op->hasOneUse()) {
+      // Adding one to a single-bit bit-field should be turned into an XOR
+ // of the bit. First thing to check is to see if this AND is with a
+ // single bit constant.
+ const APInt &AndRHSV = AndRHS->getValue();
+
+ // If there is only one bit set.
+ if (AndRHSV.isPowerOf2()) {
+ // Ok, at this point, we know that we are masking the result of the
+ // ADD down to exactly one bit. If the constant we are adding has
+ // no bits set below this bit, then we can eliminate the ADD.
+ const APInt& AddRHS = OpRHS->getValue();
+
+ // Check to see if any bits below the one bit set in AndRHSV are set.
+ if ((AddRHS & (AndRHSV - 1)).isNullValue()) {
+          // If not, the only thing that can affect the output of the AND is
+ // the bit specified by AndRHSV. If that bit is set, the effect of
+ // the XOR is to toggle the bit. If it is clear, then the ADD has
+ // no effect.
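+          // For example, ((X + 16) & 8) simplifies to (X & 8), while
+          // ((X + 8) & 8) becomes ((X & 8) ^ 8).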
+ if ((AddRHS & AndRHSV).isNullValue()) { // Bit is not set, noop
+ TheAnd.setOperand(0, X);
+ return &TheAnd;
+ } else {
+ // Pull the XOR out of the AND.
+ Value *NewAnd = Builder.CreateAnd(X, AndRHS);
+ NewAnd->takeName(Op);
+ return BinaryOperator::CreateXor(NewAnd, AndRHS);
+ }
+ }
+ }
+ }
+ break;
+
+ case Instruction::Shl: {
+ // We know that the AND will not produce any of the bits shifted in, so if
+ // the anded constant includes them, clear them now!
+ //
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
+ ConstantInt *CI = Builder.getInt(AndRHS->getValue() & ShlMask);
+
+ if (CI->getValue() == ShlMask)
+ // Masking out bits that the shift already masks.
+ return replaceInstUsesWith(TheAnd, Op); // No need for the and.
+
+ if (CI != AndRHS) { // Reducing bits set in and.
+ TheAnd.setOperand(1, CI);
+ return &TheAnd;
+ }
+ break;
+ }
+ case Instruction::LShr: {
+ // We know that the AND will not produce any of the bits shifted in, so if
+ // the anded constant includes them, clear them now! This only applies to
+ // unsigned shifts, because a signed shr may bring in set bits!
+ //
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+ ConstantInt *CI = Builder.getInt(AndRHS->getValue() & ShrMask);
+
+ if (CI->getValue() == ShrMask)
+ // Masking out bits that the shift already masks.
+ return replaceInstUsesWith(TheAnd, Op);
+
+ if (CI != AndRHS) {
+ TheAnd.setOperand(1, CI); // Reduce bits set in and cst.
+ return &TheAnd;
+ }
+ break;
+ }
+ case Instruction::AShr:
+ // Signed shr.
+ // See if this is shifting in some sign extension, then masking it out
+ // with an and.
+ if (Op->hasOneUse()) {
+ uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+ uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+ APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+ Constant *C = Builder.getInt(AndRHS->getValue() & ShrMask);
+ if (C == AndRHS) { // Masking out bits shifted in.
+ // (Val ashr C1) & C2 -> (Val lshr C1) & C2
+ // Make the argument unsigned.
+ Value *ShVal = Op->getOperand(0);
+ ShVal = Builder.CreateLShr(ShVal, OpRHS, Op->getName());
+ return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
+ }
+ }
+ break;
+ }
+ return nullptr;
+}
+
+/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
+/// (V < Lo || V >= Hi). This method expects that Lo <= Hi. IsSigned indicates
+/// whether to treat V, Lo, and Hi as signed or not.
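+/// For example, with Lo == 13, Hi == 15, and Inside == true, this emits
+/// (V - 13) u< 2, which holds exactly when V is 13 or 14.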
+Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
+ bool isSigned, bool Inside) {
+ assert((isSigned ? Lo.sle(Hi) : Lo.ule(Hi)) &&
+ "Lo is not <= Hi in range emission code!");
+
+ Type *Ty = V->getType();
+ if (Lo == Hi)
+ return Inside ? ConstantInt::getFalse(Ty) : ConstantInt::getTrue(Ty);
+
+ // V >= Min && V < Hi --> V < Hi
+ // V < Min || V >= Hi --> V >= Hi
+ ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+ if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
+ Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
+ return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
+ }
+
+ // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
+ // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
+ Value *VMinusLo =
+ Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
+ Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
+ return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
+}
+
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
+///
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
+///
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
+///
+/// If the mask A contains a single bit, then the following is equivalent:
+/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
+/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
+enum MaskedICmpType {
+ AMask_AllOnes = 1,
+ AMask_NotAllOnes = 2,
+ BMask_AllOnes = 4,
+ BMask_NotAllOnes = 8,
+ Mask_AllZeros = 16,
+ Mask_NotAllZeros = 32,
+ AMask_Mixed = 64,
+ AMask_NotMixed = 128,
+ BMask_Mixed = 256,
+ BMask_NotMixed = 512
+};
+
+/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
+/// satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+ ICmpInst::Predicate Pred) {
+ ConstantInt *ACst = dyn_cast<ConstantInt>(A);
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+ bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+ bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ unsigned MaskVal = 0;
+ if (CCst && CCst->isZero()) {
+    // If C is zero, then both A and B qualify as masks.
+ MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+ : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+ : (AMask_AllOnes | AMask_Mixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+ : (BMask_AllOnes | BMask_Mixed));
+ return MaskVal;
+ }
+
+ if (A == C) {
+ MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+ : (AMask_NotAllOnes | AMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+ : (Mask_AllZeros | AMask_Mixed));
+ } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
+ }
+
+ if (B == C) {
+ MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+ : (BMask_NotAllOnes | BMask_NotMixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+ : (Mask_AllZeros | BMask_Mixed));
+ } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+ }
+
+ return MaskVal;
+}
+
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
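+/// For example, AMask_AllOnes (bit 0) swaps with AMask_NotAllOnes (bit 1), and
+/// Mask_AllZeros (bit 4) swaps with Mask_NotAllZeros (bit 5).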
+static unsigned conjugateICmpMask(unsigned Mask) {
+ unsigned NewMask;
+ NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+ AMask_Mixed | BMask_Mixed))
+ << 1;
+
+ NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+ AMask_NotMixed | BMask_NotMixed))
+ >> 1;
+
+ return NewMask;
+}
+
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
+/// RHS satisfy.
+static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
+ if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+ return 0;
+ // vectors are not (yet?) supported
+ if (LHS->getOperand(0)->getType()->isVectorTy())
+ return 0;
+
+ // Here comes the tricky part:
+ // LHS might be of the form L11 & L12 == X, X == L21 & L22,
+ // and L11 & L12 == L21 & L22. The same goes for RHS.
+  // Now we must find those components L** and R** that are equal, so
+  // that we can extract the parameters A, B, C, D, and E for the canonical
+  // form above.
+ Value *L1 = LHS->getOperand(0);
+ Value *L2 = LHS->getOperand(1);
+ Value *L11, *L12, *L21, *L22;
+ // Check whether the icmp can be decomposed into a bit test.
+ if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) {
+ L21 = L22 = L1 = nullptr;
+ } else {
+ // Look for ANDs in the LHS icmp.
+ if (!L1->getType()->isIntegerTy()) {
+ // You can icmp pointers, for example. They really aren't masks.
+ L11 = L12 = nullptr;
+ } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+ // Any icmp can be viewed as being trivially masked; if it allows us to
+ // remove one, it's worth it.
+ L11 = L1;
+ L12 = Constant::getAllOnesValue(L1->getType());
+ }
+
+ if (!L2->getType()->isIntegerTy()) {
+ // You can icmp pointers, for example. They really aren't masks.
+ L21 = L22 = nullptr;
+ } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+ L21 = L2;
+ L22 = Constant::getAllOnesValue(L2->getType());
+ }
+ }
+
+  // Bail if LHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredL))
+ return 0;
+
+ Value *R1 = RHS->getOperand(0);
+ Value *R2 = RHS->getOperand(1);
+ Value *R11, *R12;
+ bool Ok = false;
+ if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) {
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ } else {
+ return 0;
+ }
+ E = R2;
+ R1 = nullptr;
+ Ok = true;
+ } else if (R1->getType()->isIntegerTy()) {
+ if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+ // As before, model no mask as a trivial mask if it'll let us do an
+ // optimization.
+ R11 = R1;
+ R12 = Constant::getAllOnesValue(R1->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R2;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R2;
+ Ok = true;
+ }
+ }
+
+  // Bail if RHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredR))
+ return 0;
+
+ // Look for ANDs on the right side of the RHS icmp.
+ if (!Ok && R2->getType()->isIntegerTy()) {
+ if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+ R11 = R2;
+ R12 = Constant::getAllOnesValue(R2->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R1;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R1;
+ Ok = true;
+ } else {
+ return 0;
+ }
+ }
+ if (!Ok)
+ return 0;
+
+ if (L11 == A) {
+ B = L12;
+ C = L2;
+ } else if (L12 == A) {
+ B = L11;
+ C = L2;
+ } else if (L21 == A) {
+ B = L22;
+ C = L1;
+ } else if (L22 == A) {
+ B = L21;
+ C = L1;
+ }
+
+ unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+ unsigned RightType = getMaskedICmpType(A, D, E, PredR);
+ return LeftType & RightType;
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
+/// into a single (icmp(A & X) ==/!= Y).
+static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ llvm::InstCombiner::BuilderTy &Builder) {
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ unsigned Mask =
+ getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+ if (Mask == 0)
+ return nullptr;
+
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+
+ // In full generality:
+ // (icmp (A & B) Op C) | (icmp (A & D) Op E)
+ // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+ //
+ // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+ // equivalent to (icmp (A & X) !Op Y).
+ //
+ // Therefore, we can pretend for the rest of this function that we're dealing
+ // with the conjunction, provided we flip the sense of any comparisons (both
+ // input and output).
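+  //
+  // For example, (icmp ne (A & B), 0) | (icmp ne (A & D), 0) is treated as the
+  // negation of (icmp eq (A & B), 0) & (icmp eq (A & D), 0), which folds to
+  // (icmp eq (A & (B|D)), 0); the disjunction therefore becomes
+  // (icmp ne (A & (B|D)), 0).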
+
+ // In most cases we're going to produce an EQ for the "&&" case.
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ if (!IsAnd) {
+ // Convert the masking analysis into its equivalent with negated
+ // comparisons.
+ Mask = conjugateICmpMask(Mask);
+ }
+
+ if (Mask & Mask_AllZeros) {
+ // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
+ // -> (icmp eq (A & (B|D)), 0)
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ // We can't use C as zero because we might actually handle
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    // with B and D each having a single bit set.
+ Value *Zero = Constant::getNullValue(A->getType());
+ return Builder.CreateICmp(NewCC, NewAnd, Zero);
+ }
+ if (Mask & BMask_AllOnes) {
+ // (icmp eq (A & B), B) & (icmp eq (A & D), D)
+ // -> (icmp eq (A & (B|D)), (B|D))
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr);
+ }
+ if (Mask & AMask_AllOnes) {
+ // (icmp eq (A & B), A) & (icmp eq (A & D), A)
+ // -> (icmp eq (A & (B&D)), A)
+ Value *NewAnd1 = Builder.CreateAnd(B, D);
+ Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
+ return Builder.CreateICmp(NewCC, NewAnd2, A);
+ }
+
+ // Remaining cases assume at least that B and D are constant, and depend on
+ // their actual values. This isn't strictly necessary, just a "handle the
+ // easy cases for now" decision.
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ if (!BCst)
+ return nullptr;
+ ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+ if (!DCst)
+ return nullptr;
+
+ if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
+ // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+ // Only valid if one of the masks is a superset of the other (check "B&D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() & DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & AMask_NotAllOnes) {
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+ // Only valid if one of the masks is a superset of the other (check "B|D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() | DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & BMask_Mixed) {
+ // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ // We already know that B & C == C && D & E == E.
+ // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+ // C and E, which are shared by both the mask B and the mask D, don't
+ // contradict, then we can transform to
+ // -> (icmp eq (A & (B|D)), (C|E))
+ // Currently, we only handle the case of B, C, D, and E being constant.
+ // We can't simply use C and E because we might actually handle
+ // (icmp ne (A & B), B) & (icmp eq (A & D), D)
+    // with B and D each having a single bit set.
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ if (!CCst)
+ return nullptr;
+ ConstantInt *ECst = dyn_cast<ConstantInt>(E);
+ if (!ECst)
+ return nullptr;
+ if (PredL != NewCC)
+ CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ // If there is a conflict, we should actually return a false for the
+ // whole construct.
+ if (((BCst->getValue() & DCst->getValue()) &
+ (CCst->getValue() ^ ECst->getValue())).getBoolValue())
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+
+ Value *NewOr1 = Builder.CreateOr(B, D);
+ Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr1);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
+ }
+
+ return nullptr;
+}
+
+/// Try to fold a signed range check with lower bound 0 into an unsigned icmp.
+/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+/// If \p Inverted is true then the check is for the inverted range, e.g.
+/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ bool Inverted) {
+ // Check the lower range comparison, e.g. x >= 0
+ // InstCombine already ensured that if there is a constant it's on the RHS.
+ ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
+ if (!RangeStart)
+ return nullptr;
+
+ ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
+ Cmp0->getPredicate());
+
+ // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
+ if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
+ (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
+ return nullptr;
+
+ ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
+ Cmp1->getPredicate());
+
+ Value *Input = Cmp0->getOperand(0);
+ Value *RangeEnd;
+ if (Cmp1->getOperand(0) == Input) {
+ // For the upper range compare we have: icmp x, n
+ RangeEnd = Cmp1->getOperand(1);
+ } else if (Cmp1->getOperand(1) == Input) {
+ // For the upper range compare we have: icmp n, x
+ RangeEnd = Cmp1->getOperand(0);
+ Pred1 = ICmpInst::getSwappedPredicate(Pred1);
+ } else {
+ return nullptr;
+ }
+
+ // Check the upper range comparison, e.g. x < n
+ ICmpInst::Predicate NewPred;
+ switch (Pred1) {
+ case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
+ case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
+ default: return nullptr;
+ }
+
+ // This simplification is only valid if the upper range is not negative.
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
+ return nullptr;
+
+ if (Inverted)
+ NewPred = ICmpInst::getInversePredicate(NewPred);
+
+ return Builder.CreateICmp(NewPred, Input, RangeEnd);
+}
+
+static Value *
+foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = LHS->getOperand(0);
+ if (X != RHS->getOperand(0))
+ return nullptr;
+
+ const APInt *C1, *C2;
+ if (!match(LHS->getOperand(1), m_APInt(C1)) ||
+ !match(RHS->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // The larger unsigned constant goes on the right.
+ if (C1->ugt(*C2))
+ std::swap(C1, C2);
+
+ APInt Xor = *C1 ^ *C2;
+ if (Xor.isPowerOf2()) {
+ // If LHSC and RHSC differ by only one bit, then set that bit in X and
+ // compare against the larger constant:
+ // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
+ // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
+ // We choose an 'or' with a Pow2 constant rather than the inverse mask with
+ // 'and' because that may lead to smaller codegen from a smaller constant.
+ Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
+ return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
+ }
+
+ // Special case: get the ordering right when the values wrap around zero.
+  // That is, we assumed the constants were unsigned when swapping earlier.
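+  // For example, (X == 0 || X == -1) reaches this point with C1 == 0 and
+  // C2 == -1; after the swap it folds below to (X + 1) u<= 1.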
+ if (C1->isNullValue() && C2->isAllOnesValue())
+ std::swap(C1, C2);
+
+ if (*C1 == *C2 - 1) {
+ // (X == 13 || X == 14) --> X - 13 <=u 1
+ // (X != 13 && X != 14) --> X - 13 >u 1
+ // An 'add' is the canonical IR form, so favor that over a 'sub'.
+ Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
+ return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
+ }
+
+ return nullptr;
+}
+
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ Instruction &CxtI) {
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // TODO support vector splats
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero())
+ return nullptr;
+
+ Value *A, *B, *C, *D;
+ if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+ match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+ if (A == D || B == D)
+ std::swap(C, D);
+ if (B == C)
+ std::swap(A, B);
+
+ if (A == C &&
+ isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+ Value *Mask = Builder.CreateOr(B, D);
+ Value *Masked = Builder.CreateAnd(A, Mask);
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ return Builder.CreateICmp(NewPred, Masked, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold (icmp)&(icmp) if possible.
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+ Instruction &CxtI) {
+ // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // if K1 and K2 are a one-bit mask.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+ if (PredicatesFoldable(PredL, PredR)) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
+ bool isSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+ }
+ }
+
+ // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
+ return V;
+
+ // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
+ return V;
+
+ // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
+ return V;
+
+ // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC)
+ return nullptr;
+
+ if (LHSC == RHSC && PredL == PredR) {
+ // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+ // where C is a power of 2 or
+ // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
+ if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+ (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
+ }
+ }
+
+ // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
+ // where CMAX is the all ones value for the truncated type,
+ // iff the lower bits of C2 and CA are zero.
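+  // For example, with x truncated from i32 to i8:
+  //   (trunc x) == 5 & (x & 0xFF00) == 0x1200 --> (x & 0xFFFF) == 0x1205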
+ if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+ RHS->hasOneUse()) {
+ Value *V;
+ ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
+
+ // (trunc x) == C1 & (and x, CA) == C2
+ // (and x, CA) == C2 & (trunc x) == C1
+ if (match(RHS0, m_Trunc(m_Value(V))) &&
+ match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = RHSC;
+ BigC = LHSC;
+ } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+ match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = LHSC;
+ BigC = RHSC;
+ }
+
+ if (SmallC && BigC) {
+ unsigned BigBitSize = BigC->getType()->getBitWidth();
+ unsigned SmallBitSize = SmallC->getType()->getBitWidth();
+
+ // Check that the low bits are zero.
+ APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
+ if ((Low & AndC->getValue()).isNullValue() &&
+ (Low & BigC->getValue()).isNullValue()) {
+ Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
+ APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+ Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ return Builder.CreateICmp(PredL, NewAnd, NewVal);
+ }
+ }
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) & (sgt x, C2).
+ if (!PredicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and and'ing the result
+ // together. Because of the above check, we know that we only have
+  // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
+  // (from the icmp folding check above) that the two constants
+  // are not equal and that the larger constant is on the RHS.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_ULT:
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13
+ return Builder.CreateICmpULT(LHS0, LHSC);
+ if (LHSC->isZero()) // (X != 0 & X u< 14) -> X-1 u< 13
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ break; // (X != 13 & X u< 15) -> no change
+ case ICmpInst::ICMP_SLT:
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13
+ return Builder.CreateICmpSLT(LHS0, LHSC);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_NE:
+ // Potential folds for this case should already be handled.
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+ true);
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
+/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns
+/// a Value which should already be inserted into the function.
+Value *InstCombiner::foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+ Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+ Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+ FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+
+ // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+ // Suppose the relation between x and y is R, where R is one of
+ // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
+ // testing the desired relations.
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) && bool(R & CC1)
+ // = bool((R & CC0) & (R & CC1))
+ // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
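+  //
+  // For example, (fcmp olt x, y) & (fcmp ogt x, y) has CC0 = 0100 and
+  // CC1 = 0010, so CC0 & CC1 = 0000 (FCMP_FALSE) and the result is false.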
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS)
+ return getFCmpValue(getFCmpCode(Op0CC) & getFCmpCode(Op1CC), Op0LHS, Op0RHS,
+ Builder);
+
+ if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
+ RHS->getPredicate() == FCmpInst::FCMP_ORD) {
+ if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+ return nullptr;
+
+ // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y)
+ if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+ if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+ // If either of the constants are nans, then the whole thing returns
+ // false.
+ if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+ return Builder.getFalse();
+ return Builder.CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+ }
+
+ // Handle vector zeros. This occurs because the canonical form of
+ // "fcmp ord x,x" is "fcmp ord x, 0".
+ if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
+ isa<ConstantAggregateZero>(RHS->getOperand(1)))
+ return Builder.CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
+/// Match De Morgan's Laws:
+/// (~A & ~B) == (~(A | B))
+/// (~A | ~B) == (~(A & B))
+static Instruction *matchDeMorgansLaws(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ auto Opcode = I.getOpcode();
+ assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ "Trying to match De Morgan's Laws with something other than and/or");
+
+ // Flip the logic operation.
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
+
+ return nullptr;
+}
+
+bool InstCombiner::shouldOptimizeCast(CastInst *CI) {
+ Value *CastSrc = CI->getOperand(0);
+
+ // Noop casts and casts of constants should be eliminated trivially.
+ if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
+ return false;
+
+ // If this cast is paired with another cast that can be eliminated, we prefer
+ // to have it eliminated.
+ if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
+ if (isEliminableCastPair(PrecedingCI, CI))
+ return false;
+
+ // If this is a vector sext from a compare, then we don't want to break the
+ // idiom where each element of the extended vector is either zero or all ones.
+ if (CI->getOpcode() == Instruction::SExt &&
+ isa<CmpInst>(CastSrc) && CI->getDestTy()->isVectorTy())
+ return false;
+
+ return true;
+}
+
+/// Fold {and,or,xor} (cast X), C.
+static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *C;
+ if (!match(Logic.getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ auto LogicOpc = Logic.getOpcode();
+ Type *DestTy = Logic.getType();
+ Type *SrcTy = Cast->getSrcTy();
+
+ // Move the logic operation ahead of a zext if the constant is unchanged in
+ // the smaller source type. Performing the logic in a smaller type may provide
+ // more information to later folds, and the smaller logic instruction may be
+ // cheaper (particularly in the case of vectors).
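+  // For example, and (zext i8 %x to i32), 7 --> zext (and i8 %x, 7) to i32,
+  // since trunc(7) zero-extends back to the original constant.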
+ Value *X;
+ if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
+ if (ZextTruncC == C) {
+ // LogicOpc (zext X), C --> zext (LogicOpc X, C)
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
+ return new ZExtInst(NewOp, DestTy);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold {and,or,xor} (cast X), Y.
+Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
+ auto LogicOpc = I.getOpcode();
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ CastInst *Cast0 = dyn_cast<CastInst>(Op0);
+ if (!Cast0)
+ return nullptr;
+
+ // This must be a cast from an integer or integer vector source type to allow
+ // transformation of the logic operation to the source type.
+ Type *DestTy = I.getType();
+ Type *SrcTy = Cast0->getSrcTy();
+ if (!SrcTy->isIntOrIntVectorTy())
+ return nullptr;
+
+ if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
+ return Ret;
+
+ CastInst *Cast1 = dyn_cast<CastInst>(Op1);
+ if (!Cast1)
+ return nullptr;
+
+ // Both operands of the logic operation are casts. The casts must be of the
+ // same type for reduction.
+ auto CastOpcode = Cast0->getOpcode();
+ if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
+ return nullptr;
+
+ Value *Cast0Src = Cast0->getOperand(0);
+ Value *Cast1Src = Cast1->getOperand(0);
+
+ // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
+ if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
+ I.getName());
+ return CastInst::Create(CastOpcode, NewOp, DestTy);
+ }
+
+ // For now, only 'and'/'or' have optimizations after this.
+ if (LogicOpc == Instruction::Xor)
+ return nullptr;
+
+ // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
+ if (ICmp0 && ICmp1) {
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+ : foldOrOfICmps(ICmp0, ICmp1, I);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
+ FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
+ if (FCmp0 && FCmp1) {
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfFCmps(FCmp0, FCmp1)
+ : foldOrOfFCmps(FCmp0, FCmp1);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Canonicalize SExt or Not to the LHS
+ if (match(Op1, m_SExt(m_Value())) || match(Op1, m_Not(m_Value()))) {
+ std::swap(Op0, Op1);
+ }
+
+ // Fold (and (sext bool to A), B) --> (select bool, B, 0)
+ Value *X = nullptr;
+ if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Zero = Constant::getNullValue(Op1->getType());
+ return SelectInst::Create(X, Op1, Zero);
+ }
+
+ // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
+ if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
+ X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ return SelectInst::Create(X, Zero, Op1);
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldAndToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::And);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'or' is Op0.
+ // (A | B) & ~(A & B) --> A ^ B
+ // (A | B) & ~(B & A) --> A ^ B
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) & (~A | B) --> ~(A ^ B)
+ // (A | ~B) & (B | ~A) --> ~(A ^ B)
+ // (~B | A) & (~A | B) --> ~(A ^ B)
+ // (~B | A) & (B | ~A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
+static Instruction *foldOrToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'and' is Op0.
+ // (A & B) | ~(A | B) --> ~(A ^ B)
+ // (A & B) | ~(B | A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ // (A & ~B) | (~A & B) --> A ^ B
+ // (A & ~B) | (B & ~A) --> A ^ B
+ // (~B & A) | (~A & B) --> A ^ B
+ // (~B & A) | (B & ~A) --> A ^ B
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+
+ return nullptr;
+}
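+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks, on 8-bit values, the two identities
+// that foldOrToXor relies on:
+//   (A & B) | ~(A | B) == ~(A ^ B)
+//   (A & ~B) | (~A & B) == A ^ B
+static bool sketchCheckOrToXor() {
+  for (unsigned A = 0; A != 256; ++A) {
+    for (unsigned B = 0; B != 256; ++B) {
+      if ((((A & B) | ~(A | B)) & 0xFFu) != (~(A ^ B) & 0xFFu))
+        return false;
+      if ((((A & ~B) | (~A & B)) & 0xFFu) != (A ^ B))
+        return false;
+    }
+  }
+  return true;
+}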
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
+Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyAndInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldAndToXor(I, Builder))
+ return Xor;
+
+ // (A|B)&(A|C) -> A|(B&C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+ const APInt &AndRHSMask = AndRHS->getValue();
+
+ // Optimize a variety of ((val OP C1) & C2) combinations...
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ Value *Op0LHS = Op0I->getOperand(0);
+ Value *Op0RHS = Op0I->getOperand(1);
+ switch (Op0I->getOpcode()) {
+ default: break;
+ case Instruction::Xor:
+ case Instruction::Or: {
+ // If the mask is only needed on one incoming arm, push it up.
+ if (!Op0I->hasOneUse()) break;
+
+ APInt NotAndRHS(~AndRHSMask);
+ if (MaskedValueIsZero(Op0LHS, NotAndRHS, 0, &I)) {
+ // Not masking anything out for the LHS, move to RHS.
+ Value *NewRHS = Builder.CreateAnd(Op0RHS, AndRHS,
+ Op0RHS->getName()+".masked");
+ return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS);
+ }
+ if (!isa<Constant>(Op0RHS) &&
+ MaskedValueIsZero(Op0RHS, NotAndRHS, 0, &I)) {
+ // Not masking anything out for the RHS, move to LHS.
+ Value *NewLHS = Builder.CreateAnd(Op0LHS, AndRHS,
+ Op0LHS->getName()+".masked");
+ return BinaryOperator::Create(Op0I->getOpcode(), NewLHS, Op0RHS);
+ }
+
+ break;
+ }
+ case Instruction::Sub:
+ // -x & 1 -> x & 1
+ if (AndRHSMask.isOneValue() && match(Op0LHS, m_Zero()))
+ return BinaryOperator::CreateAnd(Op0RHS, AndRHS);
+
+ break;
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ // (1 << x) & 1 --> zext(x == 0)
+ // (1 >> x) & 1 --> zext(x == 0)
+ if (AndRHSMask.isOneValue() && Op0LHS == AndRHS) {
+ Value *NewICmp =
+ Builder.CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType()));
+ return new ZExtInst(NewICmp, I.getType());
+ }
+ break;
+ }
+
+      // ((C1 OP zext(X)) & C2) -> zext((C1 OP X) & C2) if C2 fits in the
+      // bitwidth of X and OP behaves well when given trunc(C1) and X.
+ switch (Op0I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::Mul:
+ case Instruction::Add:
+ case Instruction::Sub:
+ Value *X;
+ ConstantInt *C1;
+ if (match(Op0I, m_c_BinOp(m_ZExt(m_Value(X)), m_ConstantInt(C1)))) {
+ if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+ auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+ Value *BinOp;
+ if (isa<ZExtInst>(Op0LHS))
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+ else
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+ auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+ auto *And = Builder.CreateAnd(BinOp, TruncC2);
+ return new ZExtInst(And, I.getType());
+ }
+ }
+ }
+
+ if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+ if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+ return Res;
+ }
+
+ // If this is an integer truncation, and if the source is an 'and' with
+ // immediate, transform it. This frequently occurs for bitfield accesses.
+ {
+ Value *X = nullptr; ConstantInt *YC = nullptr;
+ if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) {
+ // Change: and (trunc (and X, YC) to T), C2
+ // into : and (trunc X to T), trunc(YC) & C2
+ // This will fold the two constants together, which may allow
+ // other simplifications.
+ Value *NewCast = Builder.CreateTrunc(X, I.getType(), "and.shrunk");
+ Constant *C3 = ConstantExpr::getTrunc(YC, I.getType());
+ C3 = ConstantExpr::getAnd(C3, AndRHS);
+ return BinaryOperator::CreateAnd(NewCast, C3);
+ }
+ }
+ }
+
+ if (isa<Constant>(Op1))
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ {
+ Value *A = nullptr, *B = nullptr, *C = nullptr;
+ // A&(A^B) => A & ~B
+ {
+ Value *tmpOp0 = Op0;
+ Value *tmpOp1 = Op1;
+ if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_Value(B))))) {
+ if (A == Op1 || B == Op1 ) {
+ tmpOp1 = Op0;
+ tmpOp0 = Op1;
+ // Simplify below
+ }
+ }
+
+ if (match(tmpOp1, m_OneUse(m_Xor(m_Value(A), m_Value(B))))) {
+ if (B == tmpOp0) {
+ std::swap(A, B);
+ }
+        // Notice that the pattern (A&(~B)) is actually (A&(-1^B)), so if
+        // A is originally -1 (or a vector of -1 and undefs), we would enter
+        // an endless loop. By checking that A is non-constant we ensure that
+        // we never reach that loop.
+ if (A == tmpOp0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B
+ return BinaryOperator::CreateAnd(A, Builder.CreateNot(B));
+ }
+ }
+
+ // (A&((~A)|B)) -> A&B
+ if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), m_Value(A))))
+ return BinaryOperator::CreateAnd(A, Op1);
+ if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A))))
+ return BinaryOperator::CreateAnd(A, Op0);
+
+ // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
+
+ // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
+
+ // (A | B) & ((~A) ^ B) -> (A & B)
+ // (A | B) & (B ^ (~A)) -> (A & B)
+ // (B | A) & ((~A) ^ B) -> (A & B)
+ // (B | A) & (B ^ (~A)) -> (A & B)
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+
+ // ((~A) ^ B) & (A | B) -> (A & B)
+ // ((~A) ^ B) & (B | A) -> (A & B)
+ // (B ^ (~A)) & (A | B) -> (A & B)
+ // (B ^ (~A)) & (B | A) -> (A & B)
+ if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldAndOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'and' instructions might have to be created.
+ Value *X, *Y;
+ if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ }
+
+ // If and'ing two fcmp, try combine them into one.
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldAndOfFCmps(LHS, RHS))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
+ return CastedAnd;
+
+ if (Instruction *Select = foldBoolSextMaskToSelect(I))
+ return Select;
+
+ return Changed ? &I : nullptr;
+}
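+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It checks two of the scalar identities used by visitAnd above:
+// -X & 1 == X & 1 (negation preserves the low bit), and (1 << X) & 1 is
+// nonzero exactly when X == 0.
+static bool sketchCheckVisitAndIdentities() {
+  for (unsigned X = 0; X != 256; ++X)
+    if (((0u - X) & 1u) != (X & 1u))
+      return false;
+  for (unsigned X = 0; X != 8; ++X) // shift amounts valid for an 8-bit value
+    if ((((1u << X) & 1u) != 0u) != (X == 0u))
+      return false;
+  return true;
+}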
+
+/// Given an OR instruction, check to see if this is a bswap idiom. If so,
+/// insert the new intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Look through zero extends.
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
+ Op0 = Ext->getOperand(0);
+
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
+ Op1 = Ext->getOperand(0);
+
+ // (A | B) | C and A | (B | C) -> bswap if possible.
+ bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
+ match(Op1, m_Or(m_Value(), m_Value()));
+
+  // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible.
+ bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+ match(Op1, m_LogicalShift(m_Value(), m_Value()));
+
+ // (A & B) | (C & D) -> bswap if possible.
+ bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) &&
+ match(Op1, m_And(m_Value(), m_Value()));
+
+ if (!OrOfOrs && !OrOfShifts && !OrOfAnds)
+ return nullptr;
+
+ SmallVector<Instruction*, 4> Insts;
+ if (!recognizeBSwapOrBitReverseIdiom(&I, true, false, Insts))
+ return nullptr;
+ Instruction *LastInst = Insts.pop_back_val();
+ LastInst->removeFromParent();
+
+ for (auto *Inst : Insts)
+ Worklist.Add(Inst);
+ return LastInst;
+}
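+
+// Illustrative sketch (not part of the upstream file; the helper names are
+// invented, and a 32-bit 'unsigned' is assumed). It shows the kind of
+// shift/mask 'or' tree that recognizeBSwapOrBitReverseIdiom is expected to
+// collapse into a byte swap, checked against a byte-by-byte reference.
+static unsigned sketchBSwapIdiom(unsigned X) {
+  return (X << 24) | ((X & 0xFF00u) << 8) | ((X >> 8) & 0xFF00u) | (X >> 24);
+}
+static bool sketchCheckBSwapIdiom() {
+  unsigned Vals[] = {0x00000000u, 0x01020304u, 0xDEADBEEFu, 0xFFFFFFFFu};
+  for (unsigned X : Vals) {
+    unsigned B0 = X & 0xFFu, B1 = (X >> 8) & 0xFFu;
+    unsigned B2 = (X >> 16) & 0xFFu, B3 = (X >> 24) & 0xFFu;
+    if (sketchBSwapIdiom(X) != ((B0 << 24) | (B1 << 16) | (B2 << 8) | B3))
+      return false;
+  }
+  return true;
+}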
+
+/// If all elements of two constant vectors are 0/-1 and inverses, return true.
+static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
+ unsigned NumElts = C1->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *EltC1 = C1->getAggregateElement(i);
+ Constant *EltC2 = C2->getAggregateElement(i);
+ if (!EltC1 || !EltC2)
+ return false;
+
+ // One element must be all ones, and the other must be all zeros.
+ // FIXME: Allow undef elements.
+ if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
+ (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
+ return false;
+ }
+ return true;
+}
+
+/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
+/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
+/// B, it can be used as the condition operand of a select instruction.
+static Value *getSelectCondition(Value *A, Value *B,
+ InstCombiner::BuilderTy &Builder) {
+ // If these are scalars or vectors of i1, A can be used directly.
+ Type *Ty = A->getType();
+ if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
+ return A;
+
+ // If A and B are sign-extended, look through the sexts to find the booleans.
+ Value *Cond;
+ Value *NotB;
+ if (match(A, m_SExt(m_Value(Cond))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ match(B, m_OneUse(m_Not(m_Value(NotB))))) {
+ NotB = peekThroughBitcast(NotB, true);
+ if (match(NotB, m_SExt(m_Specific(Cond))))
+ return Cond;
+ }
+
+ // All scalar (and most vector) possibilities should be handled now.
+ // Try more matches that only apply to non-splat constant vectors.
+ if (!Ty->isVectorTy())
+ return nullptr;
+
+ // If both operands are constants, see if the constants are inverse bitmasks.
+ Constant *AC, *BC;
+ if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
+ areInverseVectorBitmasks(AC, BC))
+ return ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
+
+ // If both operands are xor'd with constants using the same sexted boolean
+ // operand, see if the constants are inverse bitmasks.
+ if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
+ match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ areInverseVectorBitmasks(AC, BC)) {
+ AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
+ return Builder.CreateXor(Cond, AC);
+ }
+ return nullptr;
+}
+
+/// We have an expression of the form (A & C) | (B & D). Try to simplify this
+/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
+ InstCombiner::BuilderTy &Builder) {
+ // The potential condition of the select may be bitcasted. In that case, look
+ // through its bitcast and the corresponding bitcast of the 'not' condition.
+ Type *OrigType = A->getType();
+ A = peekThroughBitcast(A, true);
+ B = peekThroughBitcast(B, true);
+
+ if (Value *Cond = getSelectCondition(A, B, Builder)) {
+ // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ // The bitcasts will either all exist or all not exist. The builder will
+ // not create unnecessary casts if the types already match.
+ Value *BitcastC = Builder.CreateBitCast(C, A->getType());
+ Value *BitcastD = Builder.CreateBitCast(D, A->getType());
+ Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+ return Builder.CreateBitCast(Select, OrigType);
+ }
+
+ return nullptr;
+}
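+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It checks, on 8-bit values, the identity behind
+// matchSelectFromAndOr: when A is all-zeros or all-ones and B == ~A,
+// (A & C) | (B & D) picks exactly one of C or D, i.e. it equals A ? C : D.
+static bool sketchCheckMaskSelect() {
+  for (int AllOnes = 0; AllOnes != 2; ++AllOnes) {
+    unsigned A = AllOnes ? 0xFFu : 0x00u, B = ~A & 0xFFu;
+    for (unsigned C = 0; C != 256; ++C)
+      for (unsigned D = 0; D != 256; ++D)
+        if (((A & C) | (B & D)) != (AllOnes ? C : D))
+          return false;
+  }
+  return true;
+}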
+
+/// Fold (icmp)|(icmp) if possible.
+Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+ Instruction &CxtI) {
+ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+  // if K1 and K2 are one-bit masks.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+ // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
+ // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
+ // The original condition actually refers to the following two ranges:
+ // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
+ // We can fold these two ranges if:
+  // 1) C1 and C2 are both unsigned-greater-than C3.
+  // 2) The two ranges are disjoint.
+  // 3) C1 ^ C2 is a one-bit mask.
+  // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
+ // This implies all values in the two ranges differ by exactly one bit.
+
+ if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+ PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHSC->getType() == RHSC->getType() &&
+ LHSC->getValue() == (RHSC->getValue())) {
+
+ Value *LAdd = LHS->getOperand(0);
+ Value *RAdd = RHS->getOperand(0);
+
+ Value *LAddOpnd, *RAddOpnd;
+ ConstantInt *LAddC, *RAddC;
+ if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddC))) &&
+ match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddC))) &&
+ LAddC->getValue().ugt(LHSC->getValue()) &&
+ RAddC->getValue().ugt(LHSC->getValue())) {
+
+ APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ if (LAddOpnd == RAddOpnd && DiffC.isPowerOf2()) {
+ ConstantInt *MaxAddC = nullptr;
+ if (LAddC->getValue().ult(RAddC->getValue()))
+ MaxAddC = RAddC;
+ else
+ MaxAddC = LAddC;
+
+ APInt RRangeLow = -RAddC->getValue();
+ APInt RRangeHigh = RRangeLow + LHSC->getValue();
+ APInt LRangeLow = -LAddC->getValue();
+ APInt LRangeHigh = LRangeLow + LHSC->getValue();
+ APInt LowRangeDiff = RRangeLow ^ LRangeLow;
+ APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
+ APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
+ : RRangeLow - LRangeLow;
+
+ if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
+ RangeDiff.ugt(LHSC->getValue())) {
+ Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
+
+ Value *NewAnd = Builder.CreateAnd(LAddOpnd, MaskC);
+ Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
+ return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
+ }
+ }
+ }
+ }
+
+ // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+ if (PredicatesFoldable(PredL, PredR)) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
+ bool isSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+ }
+ }
+
+ // handle (roughly):
+ // (icmp ne (A & B), C) | (icmp ne (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
+ return V;
+
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ if (LHS->hasOneUse() || RHS->hasOneUse()) {
+ // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
+ // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
+ Value *A = nullptr, *B = nullptr;
+ if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero()) {
+ B = LHS0;
+ if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS->getOperand(1))
+ A = RHS0;
+ else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
+ A = RHS->getOperand(1);
+ }
+ // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ else if (PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
+ B = RHS0;
+ if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS->getOperand(1))
+ A = LHS0;
+ else if (PredL == ICmpInst::ICMP_UGT && LHS0 == RHS0)
+ A = LHS->getOperand(1);
+ }
+ if (A && B)
+ return Builder.CreateICmp(
+ ICmpInst::ICMP_UGE,
+ Builder.CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
+ }
+
+ // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
+ return V;
+
+ // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
+ return V;
+
+ // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+ if (!LHSC || !RHSC)
+ return nullptr;
+
+ if (LHSC == RHSC && PredL == PredR) {
+ // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
+ if (PredL == ICmpInst::ICMP_NE && LHSC->isZero()) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
+ }
+ }
+
+ // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
+ // iff C2 + CA == C1.
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+ ConstantInt *AddC;
+ if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+ if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+ return Builder.CreateICmpULE(LHS0, LHSC);
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) | (sgt x, C2).
+ if (!PredicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and or'ing the result
+ // together. Because of the above check, we know that we only have
+ // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+ // icmp folding check above), that the two constants are not
+ // equal.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ // Potential folds for this case should already be handled.
+ break;
+ case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+ case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
+ false, false);
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+ assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+ false);
+ }
+ break;
+ }
+ return nullptr;
+}
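+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks one instance of the range test built by
+// foldOrOfICmps above, (X u< 13 | X u> 15) --> (X - 13) u> 2, on 8-bit
+// values with wrapping unsigned subtraction.
+static bool sketchCheckOrOfICmpsRange() {
+  for (unsigned X = 0; X != 256; ++X) {
+    bool Orig = (X < 13u) || (X > 15u);
+    bool Folded = ((X - 13u) & 0xFFu) > 2u;
+    if (Orig != Folded)
+      return false;
+  }
+  return true;
+}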
+
+/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns
+/// a Value which should already be inserted into the function.
+Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+ Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+ Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+ FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+ if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+ // Swap RHS operands to match LHS.
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+ std::swap(Op1LHS, Op1RHS);
+ }
+
+ // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
+  // This is a similar transformation to the one in foldAndOfFCmps.
+  //
+  // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+  //    bool(R & CC0) || bool(R & CC1)
+  //  = bool((R & CC0) | (R & CC1))
+  //  = bool(R & (CC0 | CC1))  <-- by applying distribution of '&' over '|' in reverse
+ if (Op0LHS == Op1LHS && Op0RHS == Op1RHS)
+ return getFCmpValue(getFCmpCode(Op0CC) | getFCmpCode(Op1CC), Op0LHS, Op0RHS,
+ Builder);
+
+ if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
+ RHS->getPredicate() == FCmpInst::FCMP_UNO &&
+ LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
+ if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+ if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+        // If either of the constants is a NaN, then the whole thing returns
+        // true.
+ if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+ return Builder.getTrue();
+
+ // Otherwise, no need to compare the two constants, compare the
+ // rest.
+ return Builder.CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+ }
+
+ // Handle vector zeros. This occurs because the canonical form of
+ // "fcmp uno x,x" is "fcmp uno x, 0".
+ if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
+ isa<ConstantAggregateZero>(RHS->getOperand(1)))
+ return Builder.CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
+/// This helper function folds:
+///
+/// ((A | B) & C1) | (B & C2)
+///
+/// into:
+///
+/// (A & C1) | B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+static Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C,
+ InstCombiner::BuilderTy &Builder) {
+ ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+ if (!CI1) return nullptr;
+
+ Value *V1 = nullptr;
+ ConstantInt *CI2 = nullptr;
+ if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return nullptr;
+
+ APInt Xor = CI1->getValue() ^ CI2->getValue();
+ if (!Xor.isAllOnesValue()) return nullptr;
+
+ if (V1 == A || V1 == B) {
+ Value *NewOp = Builder.CreateAnd((V1 == A) ? B : A, CI1);
+ return BinaryOperator::CreateOr(NewOp, V1);
+ }
+
+ return nullptr;
+}
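+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks, on 8-bit values, the identity behind
+// FoldOrWithConstants: when C2 == ~C1,
+//   ((A | B) & C1) | (B & C2) == (A & C1) | B.
+static bool sketchCheckOrWithConstants() {
+  for (unsigned A = 0; A != 256; ++A)
+    for (unsigned B = 0; B != 256; ++B)
+      for (unsigned C1 = 0; C1 != 256; ++C1) {
+        unsigned C2 = ~C1 & 0xFFu;
+        if ((((A | B) & C1) | (B & C2)) != ((A & C1) | B))
+          return false;
+      }
+  return true;
+}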
+
+/// This helper function folds:
+///
+/// ((A ^ B) & C1) | (B & C2)
+///
+/// into:
+///
+/// (A & C1) ^ B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+static Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C,
+ InstCombiner::BuilderTy &Builder) {
+ ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+ if (!CI1)
+ return nullptr;
+
+ Value *V1 = nullptr;
+ ConstantInt *CI2 = nullptr;
+ if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2))))
+ return nullptr;
+
+ APInt Xor = CI1->getValue() ^ CI2->getValue();
+ if (!Xor.isAllOnesValue())
+ return nullptr;
+
+ if (V1 == A || V1 == B) {
+ Value *NewOp = Builder.CreateAnd(V1 == A ? B : A, CI1);
+ return BinaryOperator::CreateXor(NewOp, V1);
+ }
+
+ return nullptr;
+}
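+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks, on 8-bit values, the identity behind
+// FoldXorWithConstants: when C2 == ~C1,
+//   ((A ^ B) & C1) | (B & C2) == (A & C1) ^ B.
+static bool sketchCheckXorWithConstants() {
+  for (unsigned A = 0; A != 256; ++A)
+    for (unsigned B = 0; B != 256; ++B)
+      for (unsigned C1 = 0; C1 != 256; ++C1) {
+        unsigned C2 = ~C1 & 0xFFu;
+        if ((((A ^ B) & C1) | (B & C2)) != ((A & C1) ^ B))
+          return false;
+      }
+  return true;
+}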
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
+Instruction *InstCombiner::visitOr(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyOrInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldOrToXor(I, Builder))
+ return Xor;
+
+ // (A&B)|(A&C) -> A&(B|C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (isa<Constant>(Op1))
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
+
+ // Given an OR instruction, check to see if this is a bswap.
+ if (Instruction *BSwap = MatchBSwap(I))
+ return BSwap;
+
+ {
+ Value *A;
+ const APInt *C;
+ // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+ if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op1, *C, 0, &I)) {
+ Value *NOr = Builder.CreateOr(A, Op1);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ ConstantInt::get(NOr->getType(), *C));
+ }
+
+ // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+ if (match(Op1, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op0, *C, 0, &I)) {
+ Value *NOr = Builder.CreateOr(A, Op0);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ ConstantInt::get(NOr->getType(), *C));
+ }
+ }
+
+ Value *A, *B;
+
+ // ((~A & B) | A) -> (A | B)
+ if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A))))
+ return BinaryOperator::CreateOr(A, Op1);
+ if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op0, A);
+
+ // ((A & B) | ~A) -> (~A | B)
+ // The NOT is guaranteed to be in the RHS by complexity ordering.
+ if (match(Op1, m_Not(m_Value(A))) &&
+ match(Op0, m_c_And(m_Specific(A), m_Value(B))))
+ return BinaryOperator::CreateOr(Op1, B);
+
+ // (A & C)|(B & D)
+ Value *C = nullptr, *D = nullptr;
+ if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+ match(Op1, m_And(m_Value(B), m_Value(D)))) {
+ Value *V1 = nullptr, *V2 = nullptr;
+ ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+ ConstantInt *C2 = dyn_cast<ConstantInt>(D);
+ if (C1 && C2) { // (A & C1)|(B & C2)
+ if ((C1->getValue() & C2->getValue()).isNullValue()) {
+ // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
+ // iff (C1&C2) == 0 and (N&~C1) == 0
+ if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == B &&
+ MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
+ (V2 == B &&
+ MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(A,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ // Or commutes, try both ways.
+ if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == A &&
+ MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
+ (V2 == A &&
+ MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(B,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+
+ // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
+ // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
+ ConstantInt *C3 = nullptr, *C4 = nullptr;
+ if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
+ (C3->getValue() & ~C1->getValue()).isNullValue() &&
+ match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
+ (C4->getValue() & ~C2->getValue()).isNullValue()) {
+ V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
+ return BinaryOperator::CreateAnd(V2,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ }
+ }
+ }
+
+ // Don't try to form a select if it's unlikely that we'll get rid of at
+ // least one of the operands. A select is generally more expensive than the
+ // 'or' that it is replacing.
+ if (Op0->hasOneUse() || Op1->hasOneUse()) {
+ // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
+ if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
+ return replaceInstUsesWith(I, V);
+ }
+
+ // ((A|B)&1)|(B&-2) -> (A&1) | B
+ if (match(A, m_c_Or(m_Value(V1), m_Specific(B)))) {
+ if (Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C, Builder))
+ return Ret;
+ }
+ // (B&-2)|((A|B)&1) -> (A&1) | B
+ if (match(B, m_c_Or(m_Specific(A), m_Value(V1)))) {
+ if (Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D, Builder))
+ return Ret;
+ }
+ // ((A^B)&1)|(B&-2) -> (A&1) ^ B
+ if (match(A, m_c_Xor(m_Value(V1), m_Specific(B)))) {
+ if (Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C, Builder))
+ return Ret;
+ }
+ // (B&-2)|((A^B)&1) -> (A&1) ^ B
+ if (match(B, m_c_Xor(m_Specific(A), m_Value(V1)))) {
+ if (Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D, Builder))
+ return Ret;
+ }
+ }
+
+ // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op0, C);
+
+ // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op1, C);
+
+ // ((B | C) & A) | B -> B | (A & C)
+ if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ // Canonicalize xor to the RHS.
+ bool SwappedForXor = false;
+ if (match(Op0, m_Xor(m_Value(), m_Value()))) {
+ std::swap(Op0, Op1);
+ SwappedForXor = true;
+ }
+
+ // A | ( A ^ B) -> A | B
+ // A | (~A ^ B) -> A | ~B
+ // (A & B) | (A ^ B)
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (Op0 == A || Op0 == B)
+ return BinaryOperator::CreateOr(A, B);
+
+ if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
+ match(Op0, m_And(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(A, B);
+
+ if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(B, B->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(A, A->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ }
+
+ // A | ~(A | B) -> A | ~B
+ // A | ~(A ^ B) -> A | ~B
+ if (match(Op1, m_Not(m_Value(A))))
+ if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
+ if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
+ Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
+ B->getOpcode() == Instruction::Xor)) {
+ Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
+ B->getOperand(0);
+ Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (B ^ ~A) -> (~A ^ B)
+ // The match order is important: match the xor first because the 'not'
+ // operation defines 'A'. We do not need to match the xor as Op0 because the
+ // xor was canonicalized to Op1 above.
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateXor(Builder.CreateNot(A), B);
+
+ if (SwappedForXor)
+ std::swap(Op0, Op1);
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldOrOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'or' instructions might have to be created.
+ Value *X, *Y;
+ if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ }
+
+ // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y)
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldOrOfFCmps(LHS, RHS))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
+ return CastedOr;
+
+ // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+
+ // Note: If we've gotten to the point of visiting the outer OR, then the
+ // inner one couldn't be simplified. If it was a constant, then it won't
+ // be simplified by a later pass either, so we try swapping the inner/outer
+ // ORs in the hopes that we'll be able to simplify it this way.
+ // (X|C) | V --> (X|V) | C
+ ConstantInt *C1;
+ if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) &&
+ match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) {
+ Value *Inner = Builder.CreateOr(A, Op1);
+ Inner->takeName(Op0);
+ return BinaryOperator::CreateOr(Inner, C1);
+ }
+
+ // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+  // Since this OR instruction hasn't been optimized further yet, we hope
+  // that this transformation will allow the new ORs to be optimized.
+ {
+ Value *X = nullptr, *Y = nullptr;
+ if (Op0->hasOneUse() && Op1->hasOneUse() &&
+ match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+ match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+ Value *orTrue = Builder.CreateOr(A, C);
+ Value *orFalse = Builder.CreateOr(B, D);
+ return SelectInst::Create(X, orTrue, orFalse);
+ }
+ }
+
+ return Changed ? &I : nullptr;
+}
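+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It checks the final transform in visitOr over small operand
+// ranges: (X ? A : B) | (X ? C : D) == X ? (A | C) : (B | D).
+static bool sketchCheckOrOfSelects() {
+  for (int X = 0; X != 2; ++X)
+    for (unsigned A = 0; A != 16; ++A)
+      for (unsigned B = 0; B != 16; ++B)
+        for (unsigned C = 0; C != 16; ++C)
+          for (unsigned D = 0; D != 16; ++D)
+            if (((X ? A : B) | (X ? C : D)) != (X ? (A | C) : (B | D)))
+              return false;
+  return true;
+}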
+
+/// A ^ B can be specified using other logic ops in a variety of patterns. We
+/// can fold these early and efficiently by morphing an existing instruction.
+static Instruction *foldXorToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Xor);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // There are 4 commuted variants for each of the basic patterns.
+
+ // (A & B) ^ (A | B) -> A ^ B
+ // (A & B) ^ (B | A) -> A ^ B
+ // (A | B) ^ (A & B) -> A ^ B
+ // (A | B) ^ (B & A) -> A ^ B
+ if ((match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) ||
+ (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // (A | ~B) ^ (~A | B) -> A ^ B
+ // (~B | A) ^ (~A | B) -> A ^ B
+ // (~A | B) ^ (A | ~B) -> A ^ B
+ // (B | ~A) ^ (A | ~B) -> A ^ B
+ if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // (A & ~B) ^ (~A & B) -> A ^ B
+ // (~B & A) ^ (~A & B) -> A ^ B
+ // (~A & B) ^ (A & ~B) -> A ^ B
+ // (B & ~A) ^ (A & ~B) -> A ^ B
+ if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // For the remaining cases we need to get rid of one of the operands.
+ if (!Op0->hasOneUse() && !Op1->hasOneUse())
+ return nullptr;
+
+ // (A | B) ^ ~(A & B) -> ~(A ^ B)
+ // (A | B) ^ ~(B & A) -> ~(A ^ B)
+ // (A & B) ^ ~(A | B) -> ~(A ^ B)
+ // (A & B) ^ ~(B | A) -> ~(A ^ B)
+ // Complexity sorting ensures the not will be on the right side.
+ if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
+ (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
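+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks, on 8-bit values, two of the identities
+// that foldXorToXor morphs instructions with:
+//   (A & B) ^ (A | B) == A ^ B
+//   (A | B) ^ ~(A & B) == ~(A ^ B)
+static bool sketchCheckXorToXor() {
+  for (unsigned A = 0; A != 256; ++A) {
+    for (unsigned B = 0; B != 256; ++B) {
+      if (((A & B) ^ (A | B)) != (A ^ B))
+        return false;
+      if ((((A | B) ^ ~(A & B)) & 0xFFu) != (~(A ^ B) & 0xFFu))
+        return false;
+    }
+  }
+  return true;
+}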
+
+Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+ if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+ bool isSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+ }
+ }
+
+ // Instead of trying to imitate the folds for and/or, decompose this 'xor'
+ // into those logic ops. That is, try to turn this into an and-of-icmps
+ // because we have many folds for that pattern.
+ //
+ // This is based on a truth table definition of xor:
+ // X ^ Y --> (X | Y) & !(X & Y)
+ if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
+ // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
+ // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
+ if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
+ // TODO: Independently handle cases where the 'and' side is a constant.
+ if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) {
+ // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS
+ RHS->setPredicate(RHS->getInversePredicate());
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) {
+ // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS
+ LHS->setPredicate(LHS->getInversePredicate());
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ }
+ }
+
+ return nullptr;
+}
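+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It checks the boolean truth-table definition of xor that
+// foldXorOfICmps decomposes into: X ^ Y == (X | Y) & !(X & Y).
+static bool sketchCheckXorTruthTable() {
+  for (int X = 0; X != 2; ++X)
+    for (int Y = 0; Y != 2; ++Y)
+      if ((X ^ Y) != ((X | Y) & !(X & Y)))
+        return false;
+  return true;
+}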
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
+Instruction *InstCombiner::visitXor(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyXorInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *NewXor = foldXorToXor(I, Builder))
+ return NewXor;
+
+ // (A&B)^(A&C) -> A&(B^C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
+ Value *X, *Y;
+
+  // We must be able to eliminate the single-use and/or for these transforms,
+  // otherwise they would increase the instruction count.
+ // ~(~X & Y) --> (X | ~Y)
+ // ~(Y & ~X) --> (X | ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateOr(X, NotY);
+ }
+ // ~(~X | Y) --> (X & ~Y)
+ // ~(Y | ~X) --> (X & ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateAnd(X, NotY);
+ }
+
+ // Is this a 'not' (~) fed by a binary operator?
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
+ // Apply DeMorgan's Law when inverts are free:
+ // ~(X & Y) --> (~X | ~Y)
+ // ~(X | Y) --> (~X & ~Y)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
+ return BinaryOperator::CreateOr(NotX, NotY);
+ return BinaryOperator::CreateAnd(NotX, NotY);
+ }
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
+ }
+ }
+
+ // not (cmp A, B) = !cmp A, B
+ CmpInst::Predicate Pred;
+ if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
+ cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, Op0);
+ }
+
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) {
+ // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
+ if (CI->hasOneUse() && Op0C->hasOneUse()) {
+ Instruction::CastOps Opcode = Op0C->getOpcode();
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ (RHSC == ConstantExpr::getCast(Opcode, Builder.getTrue(),
+ Op0C->getDestTy()))) {
+ CI->setPredicate(CI->getInversePredicate());
+ return CastInst::Create(Opcode, CI, Op0C->getType());
+ }
+ }
+ }
+ }
+
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ // ~(c-X) == X-c-1 == X+(-c-1)
+ if (Op0I->getOpcode() == Instruction::Sub && RHSC->isMinusOne())
+ if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
+ Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
+ return BinaryOperator::CreateAdd(Op0I->getOperand(1),
+ SubOne(NegOp0I0C));
+ }
+
+ if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+ if (Op0I->getOpcode() == Instruction::Add) {
+          // ~(X+c) --> (-c-1)-X
+ if (RHSC->isMinusOne()) {
+ Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
+ return BinaryOperator::CreateSub(SubOne(NegOp0CI),
+ Op0I->getOperand(0));
+ } else if (RHSC->getValue().isSignMask()) {
+ // (X + C) ^ signmask -> (X + C + signmask)
+ Constant *C = Builder.getInt(RHSC->getValue() + Op0CI->getValue());
+ return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
+
+ }
+ } else if (Op0I->getOpcode() == Instruction::Or) {
+ // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+ if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(),
+ 0, &I)) {
+ Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHSC);
+ // Anything in both C1 and C2 is known to be zero, remove it from
+ // NewRHS.
+ Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHSC);
+ NewRHS = ConstantExpr::getAnd(NewRHS,
+ ConstantExpr::getNot(CommonBits));
+ Worklist.Add(Op0I);
+ I.setOperand(0, Op0I->getOperand(0));
+ I.setOperand(1, NewRHS);
+ return &I;
+ }
+ } else if (Op0I->getOpcode() == Instruction::LShr) {
+ // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3)
+ // E1 = "X ^ C1"
+ BinaryOperator *E1;
+ ConstantInt *C1;
+ if (Op0I->hasOneUse() &&
+ (E1 = dyn_cast<BinaryOperator>(Op0I->getOperand(0))) &&
+ E1->getOpcode() == Instruction::Xor &&
+ (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
+ // fold (C1 >> C2) ^ C3
+ ConstantInt *C2 = Op0CI, *C3 = RHSC;
+ APInt FoldConst = C1->getValue().lshr(C2->getValue());
+ FoldConst ^= C3->getValue();
+ // Prepare the two operands.
+ Value *Opnd0 = Builder.CreateLShr(E1->getOperand(0), C2);
+ Opnd0->takeName(Op0I);
+ cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc());
+ Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst);
+
+ return BinaryOperator::CreateXor(Opnd0, FoldVal);
+ }
+ }
+ }
+ }
+ }
+
+ if (isa<Constant>(Op1))
+ if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
+ return FoldedLogic;
+
+ {
+ Value *A, *B;
+ if (match(Op1, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
+ if (A == Op0) { // A^(A|B) == A^(B|A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (B == Op0) { // A^(B|A) == (B|A)^A
+ I.swapOperands(); // Simplified below.
+ std::swap(Op0, Op1);
+ }
+ } else if (match(Op1, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
+ if (A == Op0) { // A^(A&B) -> A^(B&A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (B == Op0) { // A^(B&A) -> (B&A)^A
+ I.swapOperands(); // Simplified below.
+ std::swap(Op0, Op1);
+ }
+ }
+ }
+
+ {
+ Value *A, *B;
+ if (match(Op0, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
+ if (A == Op1) // (B|A)^B == (A|B)^B
+ std::swap(A, B);
+ if (B == Op1) // (A|B)^B == A & ~B
+ return BinaryOperator::CreateAnd(A, Builder.CreateNot(Op1));
+ } else if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
+ if (A == Op1) // (A&B)^A -> (B&A)^A
+ std::swap(A, B);
+ const APInt *C;
+ if (B == Op1 && // (B&A)^A == ~B & A
+ !match(Op1, m_APInt(C))) { // Canonical form is (B&C)^C
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), Op1);
+ }
+ }
+ }
+
+ {
+ Value *A, *B, *C, *D;
+ // (A ^ C)^(A | B) -> ((~A) & B) ^ C
+ if (match(Op0, m_Xor(m_Value(D), m_Value(C))) &&
+ match(Op1, m_Or(m_Value(A), m_Value(B)))) {
+ if (D == A)
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(A), B), C);
+ if (D == B)
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(B), A), C);
+ }
+ // (A | B)^(A ^ C) -> ((~A) & B) ^ C
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Xor(m_Value(D), m_Value(C)))) {
+ if (D == A)
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(A), B), C);
+ if (D == B)
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(B), A), C);
+ }
+ // (A & B) ^ (A ^ B) -> (A | B)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+ // (A ^ B) ^ (A & B) -> (A | B)
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+ }
+
+ // (A & ~B) ^ ~A -> ~(A & B)
+ // (~B & A) ^ ~A -> ~(A & B)
+ Value *A, *B;
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Not(m_Specific(A))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
+
+ if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Value *V = foldXorOfICmps(LHS, RHS))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
+ return CastedXor;
+
+ return Changed ? &I : nullptr;
+}
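+
+// Illustrative sketch (not part of the upstream file; the helper name is
+// invented). It exhaustively checks De Morgan's laws, which visitXor applies
+// above when both inversions are free:
+//   ~(X & Y) == ~X | ~Y   and   ~(X | Y) == ~X & ~Y
+static bool sketchCheckDeMorgan() {
+  for (unsigned X = 0; X != 256; ++X) {
+    for (unsigned Y = 0; Y != 256; ++Y) {
+      if ((~(X & Y) & 0xFFu) != ((~X | ~Y) & 0xFFu))
+        return false;
+      if ((~(X | Y) & 0xFFu) != ((~X & ~Y) & 0xFFu))
+        return false;
+    }
+  }
+  return true;
+}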
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
new file mode 100644
index 000000000000..391c430dab75
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -0,0 +1,4416 @@
+//===- InstCombineCalls.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitCall and visitInvoke functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+ "unfold-element-atomic-memcpy-max-elements",
+ cl::init(16),
+ cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+ "allowed to unfold"));
+
+/// Return the specified type promoted as it would be if it were passed
+/// through a va_arg area.
+static Type *getPromotedType(Type *Ty) {
+ if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+ if (ITy->getBitWidth() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+ }
+ return Ty;
+}
+
+/// Return a constant boolean vector that has true elements in all positions
+/// where the input constant data vector has an element with the sign bit set.
+static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
+ SmallVector<Constant *, 32> BoolVec;
+ IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
+ for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
+ Constant *Elt = V->getElementAsConstant(I);
+ assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
+ "Unexpected constant data vector element type");
+ bool Sign = V->getElementType()->isIntegerTy()
+ ? cast<ConstantInt>(Elt)->isNegative()
+ : cast<ConstantFP>(Elt)->isNegative();
+ BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
+ }
+ return ConstantVector::get(BoolVec);
+}
+
+Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
+ ElementUnorderedAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads and
+  // stores.
+  // First check that the number of elements is a compile-time constant.
+ auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
+ if (!LengthCI)
+ return nullptr;
+
+ // Check that there are not too many elements.
+ uint64_t LengthInBytes = LengthCI->getZExtValue();
+ uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
+ uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
+ if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+ return nullptr;
+
+ // Only expand if there are elements to copy.
+ if (NumElements > 0) {
+ // Don't unfold into illegal integers
+ uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
+ if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+ return nullptr;
+
+    // Cast the source and destination to the correct type. Intrinsic input
+    // arguments are usually represented as i8*. Often operands will be
+    // explicitly cast to i8*, and we could simply strip those casts instead
+    // of inserting new ones. However, it is easier to rely on other
+    // InstCombine rules, which cover the trivial cases anyway.
+ Value *Src = AMI->getRawSource();
+ Value *Dst = AMI->getRawDest();
+ Type *ElementPointerType =
+ Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+ Src->getType()->getPointerAddressSpace());
+
+ Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
+ "memcpy_unfold.src_casted");
+ Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
+ "memcpy_unfold.dst_casted");
+
+ for (uint64_t i = 0; i < NumElements; ++i) {
+ // Get current element addresses
+ ConstantInt *ElementIdxCI =
+ ConstantInt::get(AMI->getContext(), APInt(64, i));
+ Value *SrcElementAddr =
+ Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+ Value *DstElementAddr =
+ Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+ // Load from the source. Transfer alignment information and mark load as
+ // unordered atomic.
+ LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+ Load->setOrdering(AtomicOrdering::Unordered);
+      // We know the alignment of the first element. The verifier also
+      // guarantees that the element size is less than or equal to the first
+      // element's alignment and that both values are powers of two, so all
+      // subsequent accesses are at least element-size aligned.
+      // TODO: We could infer better alignment, but there is no evidence that
+      // this will matter.
+ Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
+ : ElementSizeInBytes);
+ Load->setDebugLoc(AMI->getDebugLoc());
+
+ // Store loaded value via unordered atomic store.
+ StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
+ Store->setOrdering(AtomicOrdering::Unordered);
+ Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
+ : ElementSizeInBytes);
+ Store->setDebugLoc(AMI->getDebugLoc());
+ }
+ }
+
+  // Set the number of elements of the copy to 0; it will be deleted on the
+  // next iteration.
+ AMI->setLength(Constant::getNullValue(LengthCI->getType()));
+ return AMI;
+}
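+// Example (illustrative): a 16-byte element.unordered.atomic memcpy with an
+// element size of 4 (and an element count below the unfold limit) is expanded
+// into four unordered atomic i32 load/store pairs; the original intrinsic's
+// length is then zeroed so it is erased on a later visit.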
+
+Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+ unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
+ unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
+ unsigned MinAlign = std::min(DstAlign, SrcAlign);
+ unsigned CopyAlign = MI->getAlignment();
+
+ if (CopyAlign < MinAlign) {
+ MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
+ return MI;
+ }
+
+ // If the MemCpyInst length is 1/2/4/8 bytes, then replace the memcpy with a
+ // single load/store pair.
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+ if (!MemOpLength) return nullptr;
+
+ // Source and destination pointer types are always "i8*" for the intrinsic. See
+ // if the size is something we can handle with a single primitive load/store.
+ // A single load+store correctly handles overlapping memory in the memmove
+ // case.
+ uint64_t Size = MemOpLength->getLimitedValue();
+ assert(Size && "0-sized memory transfers should already be removed.");
+
+ if (Size > 8 || (Size & (Size - 1)))
+ return nullptr; // If not 1/2/4/8 bytes, exit.
+
+ // Use an integer load+store unless we can find something better.
+ unsigned SrcAddrSp =
+ cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
+ unsigned DstAddrSp =
+ cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
+
+ IntegerType *IntType = IntegerType::get(MI->getContext(), Size << 3);
+ Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
+ Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
+
+ // If the memcpy has metadata describing the members, see if we can get the
+ // TBAA tag describing our copy.
+ MDNode *CopyMD = nullptr;
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
+ M->getOperand(1) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ Size &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
+ }
+
+ // If the memcpy/memmove provides better alignment info than we can
+ // infer, use it.
+ SrcAlign = std::max(SrcAlign, CopyAlign);
+ DstAlign = std::max(DstAlign, CopyAlign);
+
+ Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
+ Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
+ LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
+ L->setAlignment(SrcAlign);
+ if (CopyMD)
+ L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ MDNode *LoopMemParallelMD =
+ MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (LoopMemParallelMD)
+ L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+
+ StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
+ S->setAlignment(DstAlign);
+ if (CopyMD)
+ S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ if (LoopMemParallelMD)
+ S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+
+ // Set the size of the copy to 0; it will be deleted on the next iteration.
+ MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+ return MI;
+}
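+// Example (illustrative): a non-volatile memcpy of exactly 8 bytes is turned
+// into a single i64 load from the source and an i64 store to the destination,
+// carrying over the best known alignments and any tbaa.struct-derived TBAA
+// tag; the memcpy's length is then zeroed so it is deleted.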
+
+Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+ unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
+ if (MI->getAlignment() < Alignment) {
+ MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
+ Alignment, false));
+ return MI;
+ }
+
+ // Extract the length, alignment, and fill; give up unless the length and
+ // fill are constant.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+ ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+ if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
+ return nullptr;
+ uint64_t Len = LenC->getLimitedValue();
+ Alignment = MI->getAlignment();
+ assert(Len && "0-sized memory sets should already be removed.");
+
+ // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+ if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+ Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
+
+ Value *Dest = MI->getDest();
+ unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
+ Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
+ Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
+
+ // For memset, an alignment of 0 is equivalent to alignment 1, but not for a
+ // store, so normalize it.
+ if (Alignment == 0) Alignment = 1;
+
+ // Extract the fill value and store.
+ uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+ StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
+ MI->isVolatile());
+ S->setAlignment(Alignment);
+
+ // Set the length of the memset to 0; it will be deleted on the next
+ // iteration.
+ MI->setLength(Constant::getNullValue(LenC->getType()));
+ return MI;
+ }
+
+ return nullptr;
+}
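+// Example (illustrative): memset(p, 0xAB, 4) becomes a store of the i32
+// constant 0xABABABAB to p; the fill byte is splatted by multiplying with
+// 0x0101010101010101 and truncating to the store type.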
+
+static Value *simplifyX86immShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ LogicalShift = false; ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ LogicalShift = true; ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ LogicalShift = true; ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ // Simplify if count is constant.
+ auto Arg1 = II.getArgOperand(1);
+ auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
+ auto CDV = dyn_cast<ConstantDataVector>(Arg1);
+ auto CInt = dyn_cast<ConstantInt>(Arg1);
+ if (!CAZ && !CDV && !CInt)
+ return nullptr;
+
+ APInt Count(64, 0);
+ if (CDV) {
+ // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
+ auto VT = cast<VectorType>(CDV->getType());
+ unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
+ assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
+ unsigned NumSubElts = 64 / BitWidth;
+
+ // Concatenate the sub-elements to create the 64-bit value.
+ for (unsigned i = 0; i != NumSubElts; ++i) {
+ unsigned SubEltIdx = (NumSubElts - 1) - i;
+ auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ Count <<= BitWidth;
+ Count |= SubElt->getValue().zextOrTrunc(64);
+ }
+ } else if (CInt)
+ Count = CInt->getValue();
+
+ auto Vec = II.getArgOperand(0);
+ auto VT = cast<VectorType>(Vec->getType());
+ auto SVT = VT->getElementType();
+ unsigned VWidth = VT->getNumElements();
+ unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
+ // If shifting by zero, just return the original value.
+ if (Count.isNullValue())
+ return Vec;
+
+ // Handle cases when Shift >= BitWidth.
+ if (Count.uge(BitWidth)) {
+ // If LogicalShift - just return zero.
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+
+ // If ArithmeticShift - clamp Shift to (BitWidth - 1).
+ Count = APInt(64, BitWidth - 1);
+ }
+
+ // Get a constant vector of the same type as the first operand.
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
+ auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
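+// Example (illustrative): psrai.d(<4 x i32> %v, 3) becomes
+// "ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>". A logical shift by 32 or
+// more folds to the zero vector, while an arithmetic shift count is clamped to
+// BitWidth - 1.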
+
+// Attempt to simplify AVX2/AVX512 per-element shift intrinsics to a generic
+// IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!CShift)
+ return nullptr;
+
+ auto Vec = II.getArgOperand(0);
+ auto VT = cast<VectorType>(II.getType());
+ auto SVT = VT->getVectorElementType();
+ int NumElts = VT->getNumElements();
+ int BitWidth = SVT->getIntegerBitWidth();
+
+ // Collect each element's shift amount.
+ // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+ bool AnyOutOfRange = false;
+ SmallVector<int, 8> ShiftAmts;
+ for (int I = 0; I < NumElts; ++I) {
+ auto *CElt = CShift->getAggregateElement(I);
+ if (CElt && isa<UndefValue>(CElt)) {
+ ShiftAmts.push_back(-1);
+ continue;
+ }
+
+ auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+ if (!COp)
+ return nullptr;
+
+ // Handle out of range shifts.
+ // If LogicalShift - set to BitWidth (special case).
+ // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+ APInt ShiftVal = COp->getValue();
+ if (ShiftVal.uge(BitWidth)) {
+ AnyOutOfRange = LogicalShift;
+ ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+ continue;
+ }
+
+ ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+ }
+
+ // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
+ // An arithmetic shift should only hit this case when all elements are UNDEF.
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+ if (all_of(ShiftAmts, OutOfRange)) {
+ SmallVector<Constant *, 8> ConstantVec;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0) {
+ ConstantVec.push_back(UndefValue::get(SVT));
+ } else {
+ assert(LogicalShift && "Logical shift expected");
+ ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+ }
+ }
+ return ConstantVector::get(ConstantVec);
+ }
+
+ // We can't handle a mix of in-range and out-of-range shift amounts with
+ // generic logical shifts.
+ if (AnyOutOfRange)
+ return nullptr;
+
+ // Build the shift amount constant vector.
+ SmallVector<Constant *, 8> ShiftVecAmts;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0)
+ ShiftVecAmts.push_back(UndefValue::get(SVT));
+ else
+ ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+ }
+ auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
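+// Example (illustrative): psrlv.d(<4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>)
+// becomes "lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>". If every shift
+// amount is out of range or undef, the result folds directly to a constant
+// vector of zeros/undefs instead.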
+
+static Value *simplifyX86muldq(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+ assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
+ Arg1->getType()->getScalarSizeInBits() == 32 &&
+ ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
+
+ // muldq/muludq with an undef operand folds to zero (matching generic mul
+ // behavior, where undef * x -> 0).
+ if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
+ return ConstantAggregateZero::get(ResTy);
+
+ // Constant folding.
+ // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
+ // vXi64 sext(shuffle<0,2,..>(Arg1))))
+ // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
+ // vXi64 zext(shuffle<0,2,..>(Arg1))))
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ unsigned NumElts = ResTy->getVectorNumElements();
+ assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
+ Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
+ "Unexpected muldq/muludq types");
+
+ unsigned IntrinsicID = II.getIntrinsicID();
+ bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
+ Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
+ Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
+
+ SmallVector<unsigned, 16> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i * 2);
+
+ auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
+ auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
+
+ if (IsSigned) {
+ LHS = Builder.CreateSExt(LHS, ResTy);
+ RHS = Builder.CreateSExt(RHS, ResTy);
+ } else {
+ LHS = Builder.CreateZExt(LHS, ResTy);
+ RHS = Builder.CreateZExt(RHS, ResTy);
+ }
+
+ return Builder.CreateMul(LHS, RHS);
+}
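+// Example (illustrative): pmuludq on two constant <4 x i32> operands shuffles
+// out the even elements <0, 2>, zero-extends them to <2 x i64>, and multiplies
+// them, letting ordinary constant folding finish the job.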
+
+static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+ // Fast all undef handling.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ Type *ArgTy = Arg0->getType();
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumDstElts = ResTy->getVectorNumElements();
+ unsigned NumSrcElts = ArgTy->getVectorNumElements();
+ assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
+
+ unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ auto *Cst0 = dyn_cast<Constant>(Arg0);
+ auto *Cst1 = dyn_cast<Constant>(Arg1);
+ if (!Cst0 || !Cst1)
+ return nullptr;
+
+ SmallVector<Constant *, 32> Vals;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+ unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+ auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
+ auto *COp = Cst->getAggregateElement(SrcIdx);
+ if (COp && isa<UndefValue>(COp)) {
+ Vals.push_back(UndefValue::get(ResTy->getScalarType()));
+ continue;
+ }
+
+ auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
+ if (!CInt)
+ return nullptr;
+
+ APInt Val = CInt->getValue();
+ assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
+ "Unexpected constant bitwidth");
+
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ if (Val.isSignedIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getSignedMinValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ if (Val.isIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getNullValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getAllOnesValue(DstScalarSizeInBits);
+ }
+
+ Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
+ }
+ }
+
+ return ConstantVector::get(Vals);
+}
+
+static Value *simplifyX86movmsk(const IntrinsicInst &II) {
+ Value *Arg = II.getArgOperand(0);
+ Type *ResTy = II.getType();
+ Type *ArgTy = Arg->getType();
+
+ // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+ if (isa<UndefValue>(Arg))
+ return Constant::getNullValue(ResTy);
+
+ // We can't easily peek through x86_mmx types.
+ if (!ArgTy->isVectorTy())
+ return nullptr;
+
+ auto *C = dyn_cast<Constant>(Arg);
+ if (!C)
+ return nullptr;
+
+ // Extract signbits of the vector input and pack into integer result.
+ APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
+ for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
+ auto *COp = C->getAggregateElement(I);
+ if (!COp)
+ return nullptr;
+ if (isa<UndefValue>(COp))
+ continue;
+
+ auto *CInt = dyn_cast<ConstantInt>(COp);
+ auto *CFp = dyn_cast<ConstantFP>(COp);
+ if (!CInt && !CFp)
+ return nullptr;
+
+ if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
+ Result.setBit(I);
+ }
+
+ return Constant::getIntegerValue(ResTy, Result);
+}
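+// Example (illustrative): movmskps on the constant <4 x float>
+// <-1.0, 2.0, -3.0, 4.0> folds to i32 5, because the sign bits of elements 0
+// and 2 are set.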
+
+static Value *simplifyX86insertps(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+ // The immediate permute control byte looks like this:
+ // [3:0] - zero mask for each 32-bit lane
+ // [5:4] - select one 32-bit destination lane
+ // [7:6] - select one 32-bit source lane
+
+ uint8_t Imm = CInt->getZExtValue();
+ uint8_t ZMask = Imm & 0xf;
+ uint8_t DestLane = (Imm >> 4) & 0x3;
+ uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // If all zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (ZMask == 0xf)
+ return ZeroVector;
+
+ // Initialize by passing all of the first source bits through.
+ uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32-bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
+}
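+// Example (illustrative): insertps with an immediate of 0x30 (source lane 0,
+// destination lane 3, empty zero mask) becomes
+// "shufflevector %op0, %op1, <i32 0, i32 1, i32 2, i32 4>", i.e. element 0 of
+// the second operand is inserted into lane 3 of the first.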
+
+/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
+/// or conversion to a shuffle vector.
+static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
+ ConstantInt *CILength, ConstantInt *CIIndex,
+ InstCombiner::BuilderTy &Builder) {
+ auto LowConstantHighUndef = [&](uint64_t Val) {
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ };
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ ConstantInt *CI0 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Attempt to constant fold.
+ if (CILength && CIIndex) {
+ // From AMD documentation: "The bit index and field length are each six
+ // bits in length; other bits of the field are ignored."
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
+ APInt APLength = CILength->getValue().zextOrTrunc(6);
+
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are extracting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize EXTRQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Type *IntTy32 = Type::getInt32Ty(II.getContext());
+ VectorType *ShufTy = VectorType::get(IntTy8, 16);
+
+ SmallVector<Constant *, 16> ShuffleMask;
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(
+ Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
+ for (int i = Length; i != 8; ++i)
+ ShuffleMask.push_back(
+ Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+ Value *SV = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Op0, ShufTy),
+ ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // Constant Fold - shift Index'th bit to lowest position and mask off
+ // Length bits.
+ if (CI0) {
+ APInt Elt = CI0->getValue();
+ Elt.lshrInPlace(Index);
+ Elt = Elt.zextOrTrunc(Length);
+ return LowConstantHighUndef(Elt.getZExtValue());
+ }
+
+ // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
+ Value *Args[] = {Op0, CILength, CIIndex};
+ Module *M = II.getModule();
+ Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+ return Builder.CreateCall(F, Args);
+ }
+ }
+
+ // Constant Fold - extraction from zero is always {zero, undef}.
+ if (CI0 && CI0->isZero())
+ return LowConstantHighUndef(0);
+
+ return nullptr;
+}
+
+/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
+/// folding or conversion to a shuffle vector.
+static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
+ APInt APLength, APInt APIndex,
+ InstCombiner::BuilderTy &Builder) {
+ // From AMD documentation: "The bit index and field length are each six bits
+ // in length; other bits of the field are ignored."
+ APIndex = APIndex.zextOrTrunc(6);
+ APLength = APLength.zextOrTrunc(6);
+
+ // Attempt to constant fold.
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize INSERTQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Type *IntTy32 = Type::getInt32Ty(II.getContext());
+ VectorType *ShufTy = VectorType::get(IntTy8, 16);
+
+ SmallVector<Constant *, 16> ShuffleMask;
+ for (int i = 0; i != (int)Index; ++i)
+ ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(
+ Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+ for (int i = Index + Length; i != 8; ++i)
+ ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(UndefValue::get(IntTy32));
+
+ Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
+ Builder.CreateBitCast(Op1, ShufTy),
+ ConstantVector::get(ShuffleMask));
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI00 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CI10 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Constant Fold - insert bottom Length bits starting at the Index'th bit.
+ if (CI00 && CI10) {
+ APInt V00 = CI00->getValue();
+ APInt V10 = CI10->getValue();
+ APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+ V00 = V00 & ~Mask;
+ V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+ APInt Val = V00 | V10;
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ }
+
+ // If we were an INSERTQ call, we'll save demanded elements if we convert to
+ // INSERTQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+ Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+
+ Value *Args[] = {Op0, Op1, CILength, CIIndex};
+ Module *M = II.getModule();
+ Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+ return Builder.CreateCall(F, Args);
+ }
+
+ return nullptr;
+}
+
+/// Attempt to convert pshufb* to shufflevector if the mask is constant.
+static Value *simplifyX86pshufb(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned NumElts = VecTy->getNumElements();
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of elements in shuffle mask!");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[64] = {nullptr};
+
+ // Each byte in the shuffle control mask forms an index to permute the
+ // corresponding byte in the destination operand.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
+
+ int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
+
+ // If the most significant bit (bit[7]) of each byte of the shuffle
+ // control mask is set, then zero is written in the result byte.
+ // The zero vector is in the right-hand side of the resulting
+ // shufflevector.
+
+ // The value of each index for the high 128-bit lane is the least
+ // significant 4 bits of the respective shuffle control byte.
+ Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ }
+
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = Constant::getNullValue(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
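+// Example (illustrative): a pshufb whose constant control mask has bit 7 set
+// in every byte produces indices that all point into the zero vector (the
+// second shufflevector operand), so the result is all zeros.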
+
+/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
+static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned NumElts = VecTy->getVectorNumElements();
+ bool IsPD = VecTy->getScalarType()->isDoubleTy();
+ unsigned NumLaneElts = IsPD ? 2 : 4;
+ assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[16] = {nullptr};
+
+ // The intrinsics only read one or two bits; clear the rest.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
+
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(2);
+
+ // The PD variants use bit 1 to select the per-lane element index, so
+ // shift down to convert to a generic shuffle mask index.
+ if (IsPD)
+ Index.lshrInPlace(1);
+
+ // The _256 variants are a bit trickier since the mask bits always index
+ // into the corresponding 128-bit half. In order to convert to a generic
+ // shuffle, we have to make that explicit.
+ Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
+
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ }
+
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = UndefValue::get(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ auto *MaskEltTy = Type::getInt32Ty(II.getContext());
+ unsigned Size = VecTy->getNumElements();
+ assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
+ "Unexpected shuffle mask size");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ Constant *Indexes[64] = {nullptr};
+
+ for (unsigned I = 0; I < Size; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = UndefValue::get(MaskEltTy);
+ continue;
+ }
+
+ uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
+ Index &= Size - 1;
+ Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ }
+
+ auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
+ auto V1 = II.getArgOperand(0);
+ auto V2 = UndefValue::get(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
+/// source vectors, unless a zero bit is set. If a zero bit is set,
+/// then ignore that half of the mask and clear that half of the vector.
+static Value *simplifyX86vperm2(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ uint8_t Imm = CInt->getZExtValue();
+
+ bool LowHalfZero = Imm & 0x08;
+ bool HighHalfZero = Imm & 0x80;
+
+ // If both zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (LowHalfZero && HighHalfZero)
+ return ZeroVector;
+
+ // If 0 or 1 zero mask bits are set, this is a simple shuffle.
+ unsigned NumElts = VecTy->getNumElements();
+ unsigned HalfSize = NumElts / 2;
+ SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+
+ // The high bit of the selection field chooses the 1st or 2nd operand.
+ bool LowInputSelect = Imm & 0x02;
+ bool HighInputSelect = Imm & 0x20;
+
+ // The low bit of the selection field chooses the low or high half
+ // of the selected operand.
+ bool LowHalfSelect = Imm & 0x01;
+ bool HighHalfSelect = Imm & 0x10;
+
+ // Determine which operand(s) are actually in use for this instruction.
+ Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+ Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+
+ // If needed, replace operands based on zero mask.
+ V0 = LowHalfZero ? ZeroVector : V0;
+ V1 = HighHalfZero ? ZeroVector : V1;
+
+ // Permute low half of result.
+ unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i] = StartIndex + i;
+
+ // Permute high half of result.
+ StartIndex = HighHalfSelect ? HalfSize : 0;
+ StartIndex += NumElts;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i + HalfSize] = StartIndex + i;
+
+ return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+}
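+// Example (illustrative): vperm2f128 on two <8 x float> operands with an
+// immediate of 0x31 becomes
+// "shufflevector %op0, %op1, <4, 5, 6, 7, 12, 13, 14, 15>", i.e. the high
+// 128-bit halves of the two sources are concatenated.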
+
+/// Decode XOP integer vector comparison intrinsics.
+static Value *simplifyX86vpcom(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder,
+ bool IsSigned) {
+ if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ uint64_t Imm = CInt->getZExtValue() & 0x7;
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+
+ switch (Imm) {
+ case 0x0:
+ Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ break;
+ case 0x1:
+ Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ break;
+ case 0x2:
+ Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ break;
+ case 0x3:
+ Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
+ break;
+ case 0x4:
+ Pred = ICmpInst::ICMP_EQ; break;
+ case 0x5:
+ Pred = ICmpInst::ICMP_NE; break;
+ case 0x6:
+ return ConstantInt::getSigned(VecTy, 0); // FALSE
+ case 0x7:
+ return ConstantInt::getSigned(VecTy, -1); // TRUE
+ }
+
+ if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0),
+ II.getArgOperand(1)))
+ return Builder.CreateSExtOrTrunc(Cmp, VecTy);
+ }
+ return nullptr;
+}
+
+// Emit a select instruction and appropriate bitcasts to help simplify
+// masked intrinsics.
+static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
+ InstCombiner::BuilderTy &Builder) {
+ unsigned VWidth = Op0->getType()->getVectorNumElements();
+
+ // If the mask is all ones we don't need the select. But we need to check
+ // only the bits that will be used in case VWidth is less than 8.
+ if (auto *C = dyn_cast<ConstantInt>(Mask))
+ if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
+ return Op0;
+
+ auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+
+ // If we have fewer than 8 elements, then the starting mask was an i8 and
+ // we need to extract down to the right number of elements.
+ if (VWidth < 8) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != VWidth; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, VWidth),
+ "extract");
+ }
+
+ return Builder.CreateSelect(Mask, Op0, Op1);
+}
+
+static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+
+ // fmin(x, x) -> x
+ if (Arg0 == Arg1)
+ return Arg0;
+
+ const auto *C1 = dyn_cast<ConstantFP>(Arg1);
+
+ // fmin(x, nan) -> x
+ if (C1 && C1->isNaN())
+ return Arg0;
+
+ // We can return the non-undef operand: if the undef operand were NaN, we
+ // would return the other value anyway, and the result can only be NaN when
+ // both operands are NaN.
+ //
+ // fmin(undef, x) -> x
+ if (isa<UndefValue>(Arg0))
+ return Arg1;
+
+ // fmin(x, undef) -> x
+ if (isa<UndefValue>(Arg1))
+ return Arg0;
+
+ Value *X = nullptr;
+ Value *Y = nullptr;
+ if (II.getIntrinsicID() == Intrinsic::minnum) {
+ // fmin(x, fmin(x, y)) -> fmin(x, y)
+ // fmin(y, fmin(x, y)) -> fmin(x, y)
+ if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
+ if (Arg0 == X || Arg0 == Y)
+ return Arg1;
+ }
+
+ // fmin(fmin(x, y), x) -> fmin(x, y)
+ // fmin(fmin(x, y), y) -> fmin(x, y)
+ if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
+ if (Arg1 == X || Arg1 == Y)
+ return Arg0;
+ }
+
+ // TODO: fmin(nnan x, inf) -> x
+ // TODO: fmin(nnan ninf x, flt_max) -> x
+ if (C1 && C1->isInfinity()) {
+ // fmin(x, -inf) -> -inf
+ if (C1->isNegative())
+ return Arg1;
+ }
+ } else {
+ assert(II.getIntrinsicID() == Intrinsic::maxnum);
+ // fmax(x, fmax(x, y)) -> fmax(x, y)
+ // fmax(y, fmax(x, y)) -> fmax(x, y)
+ if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
+ if (Arg0 == X || Arg0 == Y)
+ return Arg1;
+ }
+
+ // fmax(fmax(x, y), x) -> fmax(x, y)
+ // fmax(fmax(x, y), y) -> fmax(x, y)
+ if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
+ if (Arg1 == X || Arg1 == Y)
+ return Arg0;
+ }
+
+ // TODO: fmax(nnan x, -inf) -> x
+ // TODO: fmax(nnan ninf x, -flt_max) -> x
+ if (C1 && C1->isInfinity()) {
+ // fmax(x, inf) -> inf
+ if (!C1->isNegative())
+ return Arg1;
+ }
+ }
+ return nullptr;
+}
+
+static bool maskIsAllOneOrUndef(Value *Mask) {
+ auto *ConstMask = dyn_cast<Constant>(Mask);
+ if (!ConstMask)
+ return false;
+ if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
+ return true;
+ for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
+ ++I) {
+ if (auto *MaskElt = ConstMask->getAggregateElement(I))
+ if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
+ continue;
+ return false;
+ }
+ return true;
+}
+
+static Value *simplifyMaskedLoad(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // If the mask is all ones or undefs, this is a plain vector load of the 1st
+ // argument.
+ if (maskIsAllOneOrUndef(II.getArgOperand(2))) {
+ Value *LoadPtr = II.getArgOperand(0);
+ unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
+ return Builder.CreateAlignedLoad(LoadPtr, Alignment, "unmaskedload");
+ }
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, this instruction does nothing.
+ if (ConstMask->isNullValue())
+ return IC.eraseInstFromFunction(II);
+
+ // If the mask is all ones, this is a plain vector store of the 1st argument.
+ if (ConstMask->isAllOnesValue()) {
+ Value *StorePtr = II.getArgOperand(1);
+ unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
+ return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
+ }
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
+ // If the mask is all zeros, return the "passthru" argument of the gather.
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ if (ConstMask && ConstMask->isNullValue())
+ return IC.replaceInstUsesWith(II, II.getArgOperand(3));
+
+ return nullptr;
+}
+
+static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
+ // If the mask is all zeros, a scatter does nothing.
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (ConstMask && ConstMask->isNullValue())
+ return IC.eraseInstFromFunction(II);
+
+ return nullptr;
+}
+
+static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
+ assert((II.getIntrinsicID() == Intrinsic::cttz ||
+ II.getIntrinsicID() == Intrinsic::ctlz) &&
+ "Expected cttz or ctlz intrinsic");
+ Value *Op0 = II.getArgOperand(0);
+
+ KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
+
+ // Create a mask for bits above (ctlz) or below (cttz) the first known one.
+ bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
+
+ // If all bits above (ctlz) or below (cttz) the first known one are known
+ // zero, this value is constant.
+ // FIXME: This should be in InstSimplify because we're replacing an
+ // instruction with a constant.
+ if (PossibleZeros == DefiniteZeros) {
+ auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // If the input to cttz/ctlz is known to be non-zero,
+ // then change the 'ZeroIsUndef' parameter to 'true'
+ // because we know the zero behavior can't affect the result.
+ if (!Known.One.isNullValue() ||
+ isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
+ &IC.getDominatorTree())) {
+ if (!match(II.getArgOperand(1), m_One())) {
+ II.setOperand(1, IC.Builder.getTrue());
+ return &II;
+ }
+ }
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ // TODO: Handle splat vectors.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
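+// Example (illustrative): for cttz(%x, i1 false) where the known bits of %x
+// show the low three bits are zero and bit 3 is one, PossibleZeros and
+// DefiniteZeros are both 3 and the call folds to the constant 3; otherwise
+// the fold may only attach !range [DefiniteZeros, PossibleZeros + 1) metadata.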
+
+static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
+ assert(II.getIntrinsicID() == Intrinsic::ctpop &&
+ "Expected ctpop intrinsic");
+ Value *Op0 = II.getArgOperand(0);
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (!IT)
+ return nullptr;
+
+ unsigned BitWidth = IT->getBitWidth();
+ KnownBits Known(BitWidth);
+ IC.computeKnownBits(Op0, Known, 0, &II);
+
+ unsigned MinCount = Known.countMinPopulation();
+ unsigned MaxCount = Known.countMaxPopulation();
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Special case a zero mask since that's not a ConstantDataVector.
+ // This masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return nullptr;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Special case a zero mask since that's not a ConstantDataVector:
+ // this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return false;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+}
+
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
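+// Example (illustrative): fmed3AMDGCN(1.0, 5.0, 3.0): Max3 is 5.0, which
+// compares equal to Src1, so the result is maxnum(1.0, 3.0) = 3.0, the median
+// of the three inputs.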
+
+// Returns true iff the 2 intrinsics have the same operands, limiting the
+// comparison to the first NumOperands.
+static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
+ unsigned NumOperands) {
+ assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
+ assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
+ for (unsigned i = 0; i < NumOperands; i++)
+ if (I.getArgOperand(i) != E.getArgOperand(i))
+ return false;
+ return true;
+}
+
+// Remove trivially empty start/end intrinsic ranges, i.e. a start
+// immediately followed by an end (ignoring debuginfo or other
+// start/end intrinsics in between). As this handles only the most trivial
+// cases, tracking the nesting level is not needed:
+//
+// call @llvm.foo.start(i1 0) ; &I
+// call @llvm.foo.start(i1 0)
+// call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
+// call @llvm.foo.end(i1 0)
+static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
+ unsigned EndID, InstCombiner &IC) {
+ assert(I.getIntrinsicID() == StartID &&
+ "Start intrinsic does not have expected ID");
+ BasicBlock::iterator BI(I), BE(I.getParent()->end());
+ for (++BI; BI != BE; ++BI) {
+ if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
+ if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
+ continue;
+ if (E->getIntrinsicID() == EndID &&
+ haveSameOperands(I, *E, E->getNumArgOperands())) {
+ IC.eraseInstFromFunction(*E);
+ IC.eraseInstFromFunction(I);
+ return true;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
+// Convert NVVM intrinsics to target-generic LLVM code where possible.
+static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
+ // Each NVVM intrinsic we can simplify can be replaced with one of:
+ //
+ // * an LLVM intrinsic,
+ // * an LLVM cast operation,
+ // * an LLVM binary operation, or
+ // * ad-hoc LLVM IR for the particular operation.
+
+ // Some transformations are only valid when the module's
+ // flush-denormals-to-zero (ftz) setting is true/false, whereas other
+ // transformations are valid regardless of the module's ftz setting.
+ enum FtzRequirementTy {
+ FTZ_Any, // Any ftz setting is ok.
+ FTZ_MustBeOn, // Transformation is valid only if ftz is on.
+ FTZ_MustBeOff, // Transformation is valid only if ftz is off.
+ };
+ // Classes of NVVM intrinsics that can't be replaced one-to-one with a
+ // target-generic intrinsic, cast op, or binary op but that we can nonetheless
+ // simplify.
+ enum SpecialCase {
+ SPC_Reciprocal,
+ };
+
+ // SimplifyAction is a poor-man's variant (plus an additional flag) that
+ // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
+ struct SimplifyAction {
+ // Invariant: At most one of these Optionals has a value.
+ Optional<Intrinsic::ID> IID;
+ Optional<Instruction::CastOps> CastOp;
+ Optional<Instruction::BinaryOps> BinaryOp;
+ Optional<SpecialCase> Special;
+
+ FtzRequirementTy FtzRequirement = FTZ_Any;
+
+ SimplifyAction() = default;
+
+ SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
+ : IID(IID), FtzRequirement(FtzReq) {}
+
+ // Cast operations don't have anything to do with FTZ, so we skip that
+ // argument.
+ SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
+
+ SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
+ : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
+
+ SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
+ : Special(Special), FtzRequirement(FtzReq) {}
+ };
+
+ // Try to generate a SimplifyAction describing how to replace our
+ // IntrinsicInstr with target-generic LLVM IR.
+ const SimplifyAction Action = [II]() -> SimplifyAction {
+ switch (II->getIntrinsicID()) {
+
+ // NVVM intrinsics that map directly to LLVM intrinsics.
+ case Intrinsic::nvvm_ceil_d:
+ return {Intrinsic::ceil, FTZ_Any};
+ case Intrinsic::nvvm_ceil_f:
+ return {Intrinsic::ceil, FTZ_MustBeOff};
+ case Intrinsic::nvvm_ceil_ftz_f:
+ return {Intrinsic::ceil, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fabs_d:
+ return {Intrinsic::fabs, FTZ_Any};
+ case Intrinsic::nvvm_fabs_f:
+ return {Intrinsic::fabs, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fabs_ftz_f:
+ return {Intrinsic::fabs, FTZ_MustBeOn};
+ case Intrinsic::nvvm_floor_d:
+ return {Intrinsic::floor, FTZ_Any};
+ case Intrinsic::nvvm_floor_f:
+ return {Intrinsic::floor, FTZ_MustBeOff};
+ case Intrinsic::nvvm_floor_ftz_f:
+ return {Intrinsic::floor, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fma_rn_d:
+ return {Intrinsic::fma, FTZ_Any};
+ case Intrinsic::nvvm_fma_rn_f:
+ return {Intrinsic::fma, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fma_rn_ftz_f:
+ return {Intrinsic::fma, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmax_d:
+ return {Intrinsic::maxnum, FTZ_Any};
+ case Intrinsic::nvvm_fmax_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmax_ftz_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmin_d:
+ return {Intrinsic::minnum, FTZ_Any};
+ case Intrinsic::nvvm_fmin_f:
+ return {Intrinsic::minnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmin_ftz_f:
+ return {Intrinsic::minnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_round_d:
+ return {Intrinsic::round, FTZ_Any};
+ case Intrinsic::nvvm_round_f:
+ return {Intrinsic::round, FTZ_MustBeOff};
+ case Intrinsic::nvvm_round_ftz_f:
+ return {Intrinsic::round, FTZ_MustBeOn};
+ case Intrinsic::nvvm_sqrt_rn_d:
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_f:
+ // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
+ // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
+ // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
+ // the versions with explicit ftz-ness.
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_rn_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOff};
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOn};
+ case Intrinsic::nvvm_trunc_d:
+ return {Intrinsic::trunc, FTZ_Any};
+ case Intrinsic::nvvm_trunc_f:
+ return {Intrinsic::trunc, FTZ_MustBeOff};
+ case Intrinsic::nvvm_trunc_ftz_f:
+ return {Intrinsic::trunc, FTZ_MustBeOn};
+
+ // NVVM intrinsics that map to LLVM cast operations.
+ //
+ // Note that llvm's target-generic conversion operators correspond to the rz
+ // (round toward zero) versions of the nvvm conversion intrinsics, even though
+ // almost everything else here uses the rn (round to nearest even) nvvm ops.
+ case Intrinsic::nvvm_d2i_rz:
+ case Intrinsic::nvvm_f2i_rz:
+ case Intrinsic::nvvm_d2ll_rz:
+ case Intrinsic::nvvm_f2ll_rz:
+ return {Instruction::FPToSI};
+ case Intrinsic::nvvm_d2ui_rz:
+ case Intrinsic::nvvm_f2ui_rz:
+ case Intrinsic::nvvm_d2ull_rz:
+ case Intrinsic::nvvm_f2ull_rz:
+ return {Instruction::FPToUI};
+ case Intrinsic::nvvm_i2d_rz:
+ case Intrinsic::nvvm_i2f_rz:
+ case Intrinsic::nvvm_ll2d_rz:
+ case Intrinsic::nvvm_ll2f_rz:
+ return {Instruction::SIToFP};
+ case Intrinsic::nvvm_ui2d_rz:
+ case Intrinsic::nvvm_ui2f_rz:
+ case Intrinsic::nvvm_ull2d_rz:
+ case Intrinsic::nvvm_ull2f_rz:
+ return {Instruction::UIToFP};
+
+ // NVVM intrinsics that map to LLVM binary ops.
+ case Intrinsic::nvvm_add_rn_d:
+ return {Instruction::FAdd, FTZ_Any};
+ case Intrinsic::nvvm_add_rn_f:
+ return {Instruction::FAdd, FTZ_MustBeOff};
+ case Intrinsic::nvvm_add_rn_ftz_f:
+ return {Instruction::FAdd, FTZ_MustBeOn};
+ case Intrinsic::nvvm_mul_rn_d:
+ return {Instruction::FMul, FTZ_Any};
+ case Intrinsic::nvvm_mul_rn_f:
+ return {Instruction::FMul, FTZ_MustBeOff};
+ case Intrinsic::nvvm_mul_rn_ftz_f:
+ return {Instruction::FMul, FTZ_MustBeOn};
+ case Intrinsic::nvvm_div_rn_d:
+ return {Instruction::FDiv, FTZ_Any};
+ case Intrinsic::nvvm_div_rn_f:
+ return {Instruction::FDiv, FTZ_MustBeOff};
+ case Intrinsic::nvvm_div_rn_ftz_f:
+ return {Instruction::FDiv, FTZ_MustBeOn};
+
+ // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
+ // need special handling.
+ //
+ // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
+ // as well.
+ case Intrinsic::nvvm_rcp_rn_d:
+ return {SPC_Reciprocal, FTZ_Any};
+ case Intrinsic::nvvm_rcp_rn_f:
+ return {SPC_Reciprocal, FTZ_MustBeOff};
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ return {SPC_Reciprocal, FTZ_MustBeOn};
+
+ // We do not currently simplify intrinsics that give an approximate answer.
+ // These include:
+ //
+ // - nvvm_cos_approx_{f,ftz_f}
+ // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_lg2_approx_{d,f,ftz_f}
+ // - nvvm_sin_approx_{f,ftz_f}
+ // - nvvm_sqrt_approx_{f,ftz_f}
+ // - nvvm_rsqrt_approx_{d,f,ftz_f}
+ // - nvvm_div_approx_{ftz_d,ftz_f,f}
+ // - nvvm_rcp_approx_ftz_d
+ //
+ // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
+ // means that fastmath is enabled in the intrinsic. Unfortunately only
+ // binary operators (currently) have a fastmath bit in SelectionDAG, so this
+ // information gets lost and we can't select on it.
+ //
+ // TODO: div and rcp are lowered to a binary op, so in theory we could
+ // lower these to a "fast fdiv".
+
+ default:
+ return {};
+ }
+ }();
+
+ // If Action.FtzRequirement is not satisfied by the function's ftz attribute,
+ // we can bail out now. (Notice that in the case that IID is not an NVVM
+ // intrinsic, we don't have to look up any attributes, as FtzRequirement
+ // will be FTZ_Any.)
+ if (Action.FtzRequirement != FTZ_Any) {
+ bool FtzEnabled =
+ II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
+ "true";
+
+ if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
+ return nullptr;
+ }
+
+ // Simplify to target-generic intrinsic.
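+ // e.g. @llvm.nvvm.fmax.d(x, y) becomes a call to @llvm.maxnum.f64(x, y).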
+ if (Action.IID) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ // All the target-generic intrinsics currently of interest to us have one
+ // type argument, equal to that of the nvvm intrinsic's argument.
+ Type *Tys[] = {II->getArgOperand(0)->getType()};
+ return CallInst::Create(
+ Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
+ }
+
+ // Simplify to target-generic binary op.
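+ // e.g. @llvm.nvvm.add.rn.d(x, y) becomes "fadd double %x, %y".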
+ if (Action.BinaryOp)
+ return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
+ II->getArgOperand(1), II->getName());
+
+ // Simplify to target-generic cast op.
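+ // e.g. @llvm.nvvm.d2i.rz(x) becomes "fptosi double %x to i32".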
+ if (Action.CastOp)
+ return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
+ II->getName());
+
+ // All that's left are the special cases.
+ if (!Action.Special)
+ return nullptr;
+
+ switch (*Action.Special) {
+ case SPC_Reciprocal:
+ // Simplify reciprocal.
+ return BinaryOperator::Create(
+ Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
+ II->getArgOperand(0), II->getName());
+ }
+ llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
+}
+
+Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
+ removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
+ removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
+ return nullptr;
+}
+
+/// CallInst simplification. This mostly only handles folding of intrinsic
+/// instructions. For normal calls, it allows visitCallSite to do the heavy
+/// lifting.
+Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+ auto Args = CI.arg_operands();
+ if (Value *V = SimplifyCall(&CI, CI.getCalledValue(), Args.begin(),
+ Args.end(), SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
+
+ if (isFreeCall(&CI, &TLI))
+ return visitFree(CI);
+
+ // If the caller function is nounwind, mark the call as nounwind, even if the
+ // callee isn't.
+ if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
+ CI.setDoesNotThrow();
+ return &CI;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+ if (!II) return visitCallSite(&CI);
+
+ // Intrinsics cannot occur in an invoke, so handle them here instead of in
+ // visitCallSite.
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ bool Changed = false;
+
+ // memmove/cpy/set of zero bytes is a noop.
+ if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+ if (NumBytes->isNullValue())
+ return eraseInstFromFunction(CI);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+ if (CI->getZExtValue() == 1) {
+ // Replace the instruction with just byte operations. We would
+ // transform other cases to loads/stores, but we don't know if
+ // alignment is sufficient.
+ }
+ }
+
+ // No other transformations apply to volatile transfers.
+ if (MI->isVolatile())
+ return nullptr;
+
+ // If we have a memmove and the source operation is a constant global,
+ // then the source and dest pointers can't alias, so we can change this
+ // into a call to memcpy.
+ if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+ if (GVSrc->isConstant()) {
+ Module *M = CI.getModule();
+ Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ Type *Tys[3] = { CI.getArgOperand(0)->getType(),
+ CI.getArgOperand(1)->getType(),
+ CI.getArgOperand(2)->getType() };
+ CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
+ Changed = true;
+ }
+ }
+
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ // memmove(x,x,size) -> noop.
+ if (MTI->getSource() == MTI->getDest())
+ return eraseInstFromFunction(CI);
+ }
+
+ // If we can determine a pointer alignment that is bigger than currently
+ // set, update the alignment.
+ if (isa<MemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyMemTransfer(MI))
+ return I;
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+ if (Instruction *I = SimplifyMemSet(MSI))
+ return I;
+ }
+
+ if (Changed) return II;
+ }
+
+ if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
+ if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
+ if (C->isNullValue())
+ return eraseInstFromFunction(*AMI);
+
+ if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
+ return I;
+ }
+
+ if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
+ return I;
+
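+ // Helper that marks only the low DemandedWidth lanes of a Width-lane
+ // vector operand as demanded and lets SimplifyDemandedVectorElts shrink
+ // or simplify the rest.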
+ auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
+ return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
+
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::objectsize:
+ if (ConstantInt *N =
+ lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
+ return replaceInstUsesWith(CI, N);
+ return nullptr;
+
+ case Intrinsic::bswap: {
+ Value *IIOperand = II->getArgOperand(0);
+ Value *X = nullptr;
+
+ // TODO should this be in InstSimplify?
+ // bswap(bswap(x)) -> x
+ if (match(IIOperand, m_BSwap(m_Value(X))))
+ return replaceInstUsesWith(CI, X);
+
+ // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
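+ // e.g. with x: i64 truncated to i32, C below is 64 - 32 = 32, giving
+ // trunc(lshr(x, 32)).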
+ if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
+ unsigned C = X->getType()->getPrimitiveSizeInBits() -
+ IIOperand->getType()->getPrimitiveSizeInBits();
+ Value *CV = ConstantInt::get(X->getType(), C);
+ Value *V = Builder.CreateLShr(X, CV);
+ return new TruncInst(V, IIOperand->getType());
+ }
+ break;
+ }
+
+ case Intrinsic::bitreverse: {
+ Value *IIOperand = II->getArgOperand(0);
+ Value *X = nullptr;
+
+ // TODO should this be in InstSimplify?
+ // bitreverse(bitreverse(x)) -> x
+ if (match(IIOperand, m_BitReverse(m_Value(X))))
+ return replaceInstUsesWith(CI, X);
+ break;
+ }
+
+ case Intrinsic::masked_load:
+ if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, Builder))
+ return replaceInstUsesWith(CI, SimplifiedMaskedOp);
+ break;
+ case Intrinsic::masked_store:
+ return simplifyMaskedStore(*II, *this);
+ case Intrinsic::masked_gather:
+ return simplifyMaskedGather(*II, *this);
+ case Intrinsic::masked_scatter:
+ return simplifyMaskedScatter(*II, *this);
+
+ case Intrinsic::powi:
+ if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+ // powi(x, 0) -> 1.0
+ if (Power->isZero())
+ return replaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0));
+ // powi(x, 1) -> x
+ if (Power->isOne())
+ return replaceInstUsesWith(CI, II->getArgOperand(0));
+ // powi(x, -1) -> 1/x
+ if (Power->isMinusOne())
+ return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
+ II->getArgOperand(0));
+ }
+ break;
+
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ if (auto *I = foldCttzCtlz(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::ctpop:
+ if (auto *I = foldCtpop(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ if (isa<Constant>(II->getArgOperand(0)) &&
+ !isa<Constant>(II->getArgOperand(1))) {
+ // Canonicalize constants into the RHS.
+ Value *LHS = II->getArgOperand(0);
+ II->setArgOperand(0, II->getArgOperand(1));
+ II->setArgOperand(1, LHS);
+ return II;
+ }
+ LLVM_FALLTHROUGH;
+
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow: {
+ OverflowCheckFlavor OCF =
+ IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID());
+ assert(OCF != OCF_INVALID && "unexpected!");
+
+ Value *OperationResult = nullptr;
+ Constant *OverflowResult = nullptr;
+ if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1),
+ *II, OperationResult, OverflowResult))
+ return CreateOverflowTuple(II, OperationResult, OverflowResult);
+
+ break;
+ }
+
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ // Canonicalize constants to the RHS.
+ if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) {
+ II->setArgOperand(0, Arg1);
+ II->setArgOperand(1, Arg0);
+ return II;
+ }
+ if (Value *V = simplifyMinnumMaxnum(*II))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+ case Intrinsic::fmuladd: {
+ // Canonicalize fast fmuladd to the separate fmul + fadd.
+ if (II->hasUnsafeAlgebra()) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(II->getFastMathFlags());
+ Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
+ II->getArgOperand(1));
+ Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
+ Add->takeName(II);
+ return replaceInstUsesWith(*II, Add);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::fma: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ // Canonicalize constants into the RHS.
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ std::swap(Src0, Src1);
+ }
+
+ Value *LHS = nullptr;
+ Value *RHS = nullptr;
+
+ // fma fneg(x), fneg(y), z -> fma x, y, z
+ if (match(Src0, m_FNeg(m_Value(LHS))) &&
+ match(Src1, m_FNeg(m_Value(RHS)))) {
+ II->setArgOperand(0, LHS);
+ II->setArgOperand(1, RHS);
+ return II;
+ }
+
+ // fma fabs(x), fabs(x), z -> fma x, x, z
+ if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
+ match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
+ II->setArgOperand(0, LHS);
+ II->setArgOperand(1, RHS);
+ return II;
+ }
+
+ // fma x, 1, z -> fadd x, z
+ if (match(Src1, m_FPOne())) {
+ Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ RI->copyFastMathFlags(II);
+ return RI;
+ }
+
+ break;
+ }
+ case Intrinsic::fabs: {
+ Value *Cond;
+ Constant *LHS, *RHS;
+ if (match(II->getArgOperand(0),
+ m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
+ CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
+ CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
+ return SelectInst::Create(Cond, Call0, Call1);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc: {
+ Value *ExtSrc;
+ if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
+ II->getArgOperand(0)->hasOneUse()) {
+ // op (fpext x) -> fpext (op x), for fabs and the other unary FP
+ // intrinsics handled by this case.
+ Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
+ { ExtSrc->getType() });
+ CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
+ NewFabs->copyFastMathFlags(II);
+ NewFabs->takeName(II);
+ return new FPExtInst(NewFabs, II->getType());
+ }
+
+ break;
+ }
+ case Intrinsic::cos:
+ case Intrinsic::amdgcn_cos: {
+ Value *SrcSrc;
+ Value *Src = II->getArgOperand(0);
+ if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
+ match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
+ // cos(-x) -> cos(x)
+ // cos(fabs(x)) -> cos(x)
+ II->setArgOperand(0, SrcSrc);
+ return II;
+ }
+
+ break;
+ }
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ // Turn PPC lvx -> load if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ &DT) >= 16) {
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
+ return new LoadInst(Ptr);
+ }
+ break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x: {
+ // Turn PPC VSX loads into normal loads.
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
+ return new LoadInst(Ptr, Twine(""), false, 1);
+ }
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ // Turn stvx -> store if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ &DT) >= 16) {
+ Type *OpPtrTy =
+ PointerType::getUnqual(II->getArgOperand(0)->getType());
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(II->getArgOperand(0), Ptr);
+ }
+ break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x: {
+ // Turn PPC VSX stores into normal stores.
+ Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+ }
+ case Intrinsic::ppc_qpx_qvlfs:
+ // Turn PPC QPX qvlfs -> load if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ &DT) >= 16) {
+ Type *VTy = VectorType::get(Builder.getFloatTy(),
+ II->getType()->getVectorNumElements());
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(VTy));
+ Value *Load = Builder.CreateLoad(Ptr);
+ return new FPExtInst(Load, II->getType());
+ }
+ break;
+ case Intrinsic::ppc_qpx_qvlfd:
+ // Turn PPC QPX qvlfd -> load if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
+ &DT) >= 32) {
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
+ return new LoadInst(Ptr);
+ }
+ break;
+ case Intrinsic::ppc_qpx_qvstfs:
+ // Turn PPC QPX qvstfs -> store if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ &DT) >= 16) {
+ Type *VTy = VectorType::get(Builder.getFloatTy(),
+ II->getArgOperand(0)->getType()->getVectorNumElements());
+ Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
+ Type *OpPtrTy = PointerType::getUnqual(VTy);
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(TOp, Ptr);
+ }
+ break;
+ case Intrinsic::ppc_qpx_qvstfd:
+ // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
+ &DT) >= 32) {
+ Type *OpPtrTy =
+ PointerType::getUnqual(II->getArgOperand(0)->getType());
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ return new StoreInst(II->getArgOperand(0), Ptr);
+ }
+ break;
+
+ case Intrinsic::x86_vcvtph2ps_128:
+ case Intrinsic::x86_vcvtph2ps_256: {
+ auto Arg = II->getArgOperand(0);
+ auto ArgType = cast<VectorType>(Arg->getType());
+ auto RetType = cast<VectorType>(II->getType());
+ unsigned ArgWidth = ArgType->getNumElements();
+ unsigned RetWidth = RetType->getNumElements();
+ assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
+ assert(ArgType->isIntOrIntVectorTy() &&
+ ArgType->getScalarSizeInBits() == 16 &&
+ "CVTPH2PS input type should be 16-bit integer vector");
+ assert(RetType->getScalarType()->isFloatTy() &&
+ "CVTPH2PS output type should be 32-bit float vector");
+
+ // Constant folding: Convert to generic half to single conversion.
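+ // A zero input folds directly to zero; otherwise the i16 lanes are
+ // bitcast to half and extended to float, dropping the upper lanes the
+ // intrinsic ignores.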
+ if (isa<ConstantAggregateZero>(Arg))
+ return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
+
+ if (isa<ConstantDataVector>(Arg)) {
+ auto VectorHalfAsShorts = Arg;
+ if (RetWidth < ArgWidth) {
+ SmallVector<uint32_t, 8> SubVecMask;
+ for (unsigned i = 0; i != RetWidth; ++i)
+ SubVecMask.push_back((int)i);
+ VectorHalfAsShorts = Builder.CreateShuffleVector(
+ Arg, UndefValue::get(ArgType), SubVecMask);
+ }
+
+ auto VectorHalfType =
+ VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
+ auto VectorHalfs =
+ Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
+ auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
+ return replaceInstUsesWith(*II, VectorFloats);
+ }
+
+ // We only use the lowest lanes of the argument.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
+ II->setArgOperand(0, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_avx512_vcvtss2si32:
+ case Intrinsic::x86_avx512_vcvtss2si64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2si32:
+ case Intrinsic::x86_avx512_vcvtsd2si64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_cvttss2si:
+ case Intrinsic::x86_avx512_cvttss2si64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttsd2si:
+ case Intrinsic::x86_avx512_cvttsd2si64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ Value *Arg = II->getArgOperand(0);
+ unsigned VWidth = Arg->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
+ II->setArgOperand(0, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ if (Value *V = simplifyX86movmsk(*II))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ case Intrinsic::x86_avx512_vcomi_ss:
+ case Intrinsic::x86_avx512_vcomi_sd:
+ case Intrinsic::x86_avx512_mask_cmp_ss:
+ case Intrinsic::x86_avx512_mask_cmp_sd: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+ if (MadeChange)
+ return II;
+ break;
+ }
+ case Intrinsic::x86_avx512_mask_cmp_pd_128:
+ case Intrinsic::x86_avx512_mask_cmp_pd_256:
+ case Intrinsic::x86_avx512_mask_cmp_pd_512:
+ case Intrinsic::x86_avx512_mask_cmp_ps_128:
+ case Intrinsic::x86_avx512_mask_cmp_ps_256:
+ case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ bool Arg0IsZero = match(Arg0, m_Zero());
+ if (Arg0IsZero)
+ std::swap(Arg0, Arg1);
+ Value *A, *B;
+ // This fold requires only the NINF (no +/- inf) flag, since inf minus
+ // inf is nan.
+ // NSZ(No Signed Zeros) is not needed because zeros of any sign are
+ // equal for both compares.
+ // NNAN is not needed because nans compare the same for both compares.
+ // The compare intrinsic uses the above assumptions and therefore
+ // doesn't require additional flags.
+ if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
+ match(Arg1, m_Zero()) &&
+ cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
+ if (Arg0IsZero)
+ std::swap(A, B);
+ II->setArgOperand(0, A);
+ II->setArgOperand(1, B);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx512_mask_add_ps_512:
+ case Intrinsic::x86_avx512_mask_div_ps_512:
+ case Intrinsic::x86_avx512_mask_mul_ps_512:
+ case Intrinsic::x86_avx512_mask_sub_ps_512:
+ case Intrinsic::x86_avx512_mask_add_pd_512:
+ case Intrinsic::x86_avx512_mask_div_pd_512:
+ case Intrinsic::x86_avx512_mask_mul_pd_512:
+ case Intrinsic::x86_avx512_mask_sub_pd_512:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ Value *V;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ps_512:
+ case Intrinsic::x86_avx512_mask_add_pd_512:
+ V = Builder.CreateFAdd(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ps_512:
+ case Intrinsic::x86_avx512_mask_sub_pd_512:
+ V = Builder.CreateFSub(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ps_512:
+ case Intrinsic::x86_avx512_mask_mul_pd_512:
+ V = Builder.CreateFMul(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ps_512:
+ case Intrinsic::x86_avx512_mask_div_pd_512:
+ V = Builder.CreateFDiv(Arg0, Arg1);
+ break;
+ }
+
+ // Create a select for the masking.
+ V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
+ Builder);
+ return replaceInstUsesWith(*II, V);
+ }
+ }
+ break;
+
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ // Extract the element as scalars.
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);
+
+ Value *V;
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ V = Builder.CreateFAdd(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ V = Builder.CreateFSub(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ V = Builder.CreateFMul(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ V = Builder.CreateFDiv(LHS, RHS);
+ break;
+ }
+
+ // Handle the masking aspect of the intrinsic.
+ Value *Mask = II->getArgOperand(3);
+ auto *C = dyn_cast<ConstantInt>(Mask);
+ // We don't need a select if we know the mask bit is a 1.
+ if (!C || !C->getValue()[0]) {
+ // Cast the mask to an i1 vector and then extract the lowest element.
+ auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
+ (uint64_t)0);
+ V = Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return replaceInstUsesWith(*II, V);
+ }
+ }
+ LLVM_FALLTHROUGH;
+
+ // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts.
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ case Intrinsic::x86_avx512_mask_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask_vfmadd_sd:
+ case Intrinsic::x86_avx512_maskz_vfmadd_ss:
+ case Intrinsic::x86_avx512_maskz_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask3_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfmsub_sd:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
+ case Intrinsic::x86_fma_vfmadd_ss:
+ case Intrinsic::x86_fma_vfmsub_ss:
+ case Intrinsic::x86_fma_vfnmadd_ss:
+ case Intrinsic::x86_fma_vfnmsub_ss:
+ case Intrinsic::x86_fma_vfmadd_sd:
+ case Intrinsic::x86_fma_vfmsub_sd:
+ case Intrinsic::x86_fma_vfnmadd_sd:
+ case Intrinsic::x86_fma_vfnmsub_sd:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd:
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ break;
+ }
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+
+ // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ Value *Arg1 = II->getArgOperand(1);
+ assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected packed shift size");
+ unsigned VWidth = Arg1->getType()->getVectorNumElements();
+
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
+ II->setArgOperand(1, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ if (Value *V = simplifyX86varShift(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ case Intrinsic::x86_avx512_pmul_dq_512:
+ case Intrinsic::x86_avx512_pmulu_dq_512: {
+ if (Value *V = simplifyX86muldq(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+ if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(*II, true))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(*II, false))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_pclmulqdq: {
+ if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
+
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ APInt DemandedElts(VWidth, 0);
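+ // Imm bit 0 selects which i64 element of Arg0 is multiplied and bit 4
+ // selects the element of Arg1, so only one lane of each operand is read.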
+
+ APInt UndefElts1(VWidth, 0);
+ DemandedElts = (Imm & 0x01) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ UndefElts1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ DemandedElts = (Imm & 0x10) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ UndefElts2)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+
+ // If either of the demanded input elements is undef, we may choose it to
+ // be zero, which makes the whole carry-less product fold to zero.
+ if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
+ UndefElts2[(Imm & 0x10) ? 1 : 0])
+ return replaceInstUsesWith(*II,
+ ConstantAggregateZero::get(II->getType()));
+
+ if (MadeChange)
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_insertps:
+ if (Value *V = simplifyX86insertps(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse4a_extrq: {
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+ unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 16 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CILength =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CIIndex =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
+ return replaceInstUsesWith(*II, V);
+
+ // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+ // operands and the lowest 16-bits of the second.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+ if (MadeChange)
+ return II;
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_extrqi: {
+ // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+ // bits of the lower 64-bits. The upper 64-bits are undefined.
+ Value *Op0 = II->getArgOperand(0);
+ unsigned VWidth = Op0->getType()->getVectorNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
+ return replaceInstUsesWith(*II, V);
+
+ // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ II->setArgOperand(0, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertq: {
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ unsigned VWidth = Op0->getType()->getVectorNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ Op1->getType()->getVectorNumElements() == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI11 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
+ if (CI11) {
+ const APInt &V11 = CI11->getValue();
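+ // Element 1 of Op1 encodes the field: bits [5:0] are the length and
+ // bits [13:8] are the insert index.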
+ APInt Len = V11.zextOrTrunc(6);
+ APInt Idx = V11.lshr(8).zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
+ return replaceInstUsesWith(*II, V);
+ }
+
+ // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ II->setArgOperand(0, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertqi: {
+ // INSERTQI: Extract lowest Length bits from lower half of second source and
+ // insert over first source starting at Index bit. The upper 64-bits are
+ // undefined.
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ unsigned VWidth0 = Op0->getType()->getVectorNumElements();
+ unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 2 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (CILength && CIIndex) {
+ APInt Len = CILength->getValue().zextOrTrunc(6);
+ APInt Idx = CIIndex->getValue().zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
+ return replaceInstUsesWith(*II, V);
+ }
+
+ // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
+ // operands.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+ if (MadeChange)
+ return II;
+ break;
+ }
+
+ case Intrinsic::x86_sse41_pblendvb:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx2_pblendvb: {
+ // Convert blendv* to vector selects if the mask is constant.
+ // This optimization is convoluted because the intrinsic is defined as
+ // getting a vector of floats or doubles for the ps and pd versions.
+ // FIXME: That should be changed.
+
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ Value *Mask = II->getArgOperand(2);
+
+ // fold (blend A, A, Mask) -> A
+ if (Op0 == Op1)
+ return replaceInstUsesWith(CI, Op0);
+
+ // Zero Mask - select 1st argument.
+ if (isa<ConstantAggregateZero>(Mask))
+ return replaceInstUsesWith(CI, Op0);
+
+ // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
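+ // Lanes whose mask element has its sign bit set take Op1; all other
+ // lanes take Op0.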
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
+ Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
+ return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
+ }
+ break;
+ }
+
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ if (Value *V = simplifyX86pshufb(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ if (Value *V = simplifyX86vpermilvar(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ if (Value *V = simplifyX86vpermv(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx512_mask_permvar_df_256:
+ case Intrinsic::x86_avx512_mask_permvar_df_512:
+ case Intrinsic::x86_avx512_mask_permvar_di_256:
+ case Intrinsic::x86_avx512_mask_permvar_di_512:
+ case Intrinsic::x86_avx512_mask_permvar_hi_128:
+ case Intrinsic::x86_avx512_mask_permvar_hi_256:
+ case Intrinsic::x86_avx512_mask_permvar_hi_512:
+ case Intrinsic::x86_avx512_mask_permvar_qi_128:
+ case Intrinsic::x86_avx512_mask_permvar_qi_256:
+ case Intrinsic::x86_avx512_mask_permvar_qi_512:
+ case Intrinsic::x86_avx512_mask_permvar_sf_256:
+ case Intrinsic::x86_avx512_mask_permvar_sf_512:
+ case Intrinsic::x86_avx512_mask_permvar_si_256:
+ case Intrinsic::x86_avx512_mask_permvar_si_512:
+ if (Value *V = simplifyX86vpermv(*II, Builder)) {
+ // We simplified the permuting, now create a select for the masking.
+ V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
+ Builder);
+ return replaceInstUsesWith(*II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_vperm2f128_pd_256:
+ case Intrinsic::x86_avx_vperm2f128_ps_256:
+ case Intrinsic::x86_avx_vperm2f128_si_256:
+ case Intrinsic::x86_avx2_vperm2i128:
+ if (Value *V = simplifyX86vperm2(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_avx_maskload_ps:
+ case Intrinsic::x86_avx_maskload_pd:
+ case Intrinsic::x86_avx_maskload_ps_256:
+ case Intrinsic::x86_avx_maskload_pd_256:
+ case Intrinsic::x86_avx2_maskload_d:
+ case Intrinsic::x86_avx2_maskload_q:
+ case Intrinsic::x86_avx2_maskload_d_256:
+ case Intrinsic::x86_avx2_maskload_q_256:
+ if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::x86_sse2_maskmov_dqu:
+ case Intrinsic::x86_avx_maskstore_ps:
+ case Intrinsic::x86_avx_maskstore_pd:
+ case Intrinsic::x86_avx_maskstore_ps_256:
+ case Intrinsic::x86_avx_maskstore_pd_256:
+ case Intrinsic::x86_avx2_maskstore_d:
+ case Intrinsic::x86_avx2_maskstore_q:
+ case Intrinsic::x86_avx2_maskstore_d_256:
+ case Intrinsic::x86_avx2_maskstore_q_256:
+ if (simplifyX86MaskedStore(*II, *this))
+ return nullptr;
+ break;
+
+ case Intrinsic::x86_xop_vpcomb:
+ case Intrinsic::x86_xop_vpcomd:
+ case Intrinsic::x86_xop_vpcomq:
+ case Intrinsic::x86_xop_vpcomw:
+ if (Value *V = simplifyX86vpcom(*II, Builder, true))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_xop_vpcomub:
+ case Intrinsic::x86_xop_vpcomud:
+ case Intrinsic::x86_xop_vpcomuq:
+ case Intrinsic::x86_xop_vpcomuw:
+ if (Value *V = simplifyX86vpcom(*II, Builder, false))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::ppc_altivec_vperm:
+ // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+ // Note that ppc_altivec_vperm has a big-endian bias, so when creating
+ // a vectorshuffle for little endian, we must undo the transformation
+ // performed on vec_perm in altivec.h. That is, we must complement
+ // the permutation mask with respect to 31 and reverse the order of
+ // V1 and V2.
+ if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
+ assert(Mask->getType()->getVectorNumElements() == 16 &&
+ "Bad type for intrinsic!");
+
+ // Check that all of the elements are integer constants or undefs.
+ bool AllEltsOk = true;
+ for (unsigned i = 0; i != 16; ++i) {
+ Constant *Elt = Mask->getAggregateElement(i);
+ if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
+ AllEltsOk = false;
+ break;
+ }
+ }
+
+ if (AllEltsOk) {
+ // Cast the input vectors to byte vectors.
+ Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
+ Mask->getType());
+ Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
+ Mask->getType());
+ Value *Result = UndefValue::get(Op0->getType());
+
+ // Only extract each element once.
+ Value *ExtractedElts[32];
+ memset(ExtractedElts, 0, sizeof(ExtractedElts));
+
+ for (unsigned i = 0; i != 16; ++i) {
+ if (isa<UndefValue>(Mask->getAggregateElement(i)))
+ continue;
+ unsigned Idx =
+ cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
+ Idx &= 31; // Match the hardware behavior.
+ if (DL.isLittleEndian())
+ Idx = 31 - Idx;
+
+ if (!ExtractedElts[Idx]) {
+ Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
+ Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
+ ExtractedElts[Idx] =
+ Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
+ Builder.getInt32(Idx&15));
+ }
+
+ // Insert this value into the result vector.
+ Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
+ Builder.getInt32(i));
+ }
+ return CastInst::Create(Instruction::BitCast, Result, CI.getType());
+ }
+ }
+ break;
+
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ unsigned MemAlign =
+ getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
+ unsigned AlignArg = II->getNumArgOperands() - 1;
+ ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
+ if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
+ II->setArgOperand(AlignArg,
+ ConstantInt::get(Type::getInt32Ty(II->getContext()),
+ MemAlign, false));
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::arm_neon_vmulls:
+ case Intrinsic::arm_neon_vmullu:
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ // Handle mul by zero first:
+ if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+ return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+ }
+
+ // Check for constant LHS & RHS - in this case we just simplify.
+ bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu ||
+ II->getIntrinsicID() == Intrinsic::aarch64_neon_umull);
+ VectorType *NewVT = cast<VectorType>(II->getType());
+ if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+ CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
+ CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
+
+ return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
+ }
+
+ // Couldn't simplify - canonicalize constant to the RHS.
+ std::swap(Arg0, Arg1);
+ }
+
+ // Handle mul by one:
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+ if (ConstantInt *Splat =
+ dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+ if (Splat->isOne())
+ return CastInst::CreateIntegerCast(Arg0, II->getType(),
+ /*isSigned=*/!Zext);
+
+ break;
+ }
+ case Intrinsic::amdgcn_rcp: {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ const APFloat &ArgVal = C->getValueAPF();
+ APFloat Val(ArgVal.getSemantics(), 1.0);
+ APFloat::opStatus Status = Val.divide(ArgVal,
+ APFloat::rmNearestTiesToEven);
+ // Only do this if it was exact and therefore not dependent on the
+ // rounding mode.
+ if (Status == APFloat::opOK)
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_rsq: {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+ break;
+ }
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp: {
+ Value *Src = II->getArgOperand(0);
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ int Exp;
+ APFloat Significand = frexp(C->getValueAPF(), Exp,
+ APFloat::rmNearestTiesToEven);
+
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) {
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
+ Significand));
+ }
+
+ // Match instruction special case behavior.
+ if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
+ Exp = 0;
+
+ return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
+ }
+
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+ break;
+ }
+ case Intrinsic::amdgcn_class: {
+ enum {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
+
+ const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
+ N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY;
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
+ if (!CMask) {
+ if (isa<UndefValue>(Src0))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ if (isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
+ break;
+ }
+
+ uint32_t Mask = CMask->getZExtValue();
+
+ // If every class bit is set, the test is true regardless of the value.
+ if ((Mask & FullMask) == FullMask)
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));
+
+ if ((Mask & FullMask) == 0)
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
+
+ if (Mask == (S_NAN | Q_NAN)) {
+ // Equivalent of isnan. Replace with standard fcmp.
+ Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
+ FCmp->takeName(II);
+ return replaceInstUsesWith(*II, FCmp);
+ }
+
+ const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
+ if (!CVal) {
+ if (isa<UndefValue>(Src0))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ // Clamp mask to used bits
+ if ((Mask & FullMask) != Mask) {
+ CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
+ { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
+ );
+
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ break;
+ }
+
+ const APFloat &Val = CVal->getValueAPF();
+
+ bool Result =
+ ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
+ ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
+ ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
+ ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
+ ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
+ ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
+ ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
+ ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
+ ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
+ ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
+
+ return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
+ }
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem
+ = II->getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded = ConstantVector::get({
+ ConstantFP::get(II->getContext(), Val0),
+ ConstantFP::get(II->getContext(), Val1) });
+ return replaceInstUsesWith(*II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe: {
+ // Decompose simple cases into standard shifts.
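+ // e.g. for i32, bfe(x, off, w) with off + w < 32 becomes
+ // (x << (32 - off - w)) >> (32 - w), using ashr for the signed form.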
+ Value *Src = II->getArgOperand(0);
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(*II, Src);
+
+ unsigned Width;
+ Type *Ty = II->getType();
+ unsigned IntSize = Ty->getIntegerBitWidth();
+
+ ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (CWidth) {
+ Width = CWidth->getZExtValue();
+ if ((Width & (IntSize - 1)) == 0)
+ return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
+
+ if (Width >= IntSize) {
+ // Hardware ignores high bits, so remove those.
+ II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
+ Width & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ unsigned Offset;
+ ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (COffset) {
+ Offset = COffset->getZExtValue();
+ if (Offset >= IntSize) {
+ II->setArgOperand(1, ConstantInt::get(COffset->getType(),
+ Offset & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
+
+ // TODO: Also emit sub if only width is constant.
+ if (!CWidth && COffset && Offset == 0) {
+ Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
+ Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2));
+ ShiftVal = Builder.CreateZExt(ShiftVal, II->getType());
+
+ Value *Shl = Builder.CreateShl(Src, ShiftVal);
+ Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal)
+ : Builder.CreateLShr(Shl, ShiftVal);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ if (!CWidth || !COffset)
+ break;
+
+ // TODO: This allows folding to undef when the hardware has specific
+ // behavior?
+ if (Offset + Width < IntSize) {
+ Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
+ Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
+ : Builder.CreateLShr(Shl, IntSize - Width);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
+ : Builder.CreateLShr(Src, Offset);
+
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+ case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_compr: {
+ ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (!En) // Illegal.
+ break;
+
+ unsigned EnBits = En->getZExtValue();
+ if (EnBits == 0xf)
+ break; // All inputs enabled.
+
+ bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
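+ // One enable bit per exported channel (two bits per operand for the
+ // compressed form); any disabled channel's source can be replaced with
+ // undef.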
+ bool Changed = false;
+ for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+ if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+ (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+ Value *Src = II->getArgOperand(I + 2);
+ if (!isa<UndefValue>(Src)) {
+ II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+ Changed = true;
+ }
+ }
+ }
+
+ if (Changed)
+ return II;
+
+ break;
+
+ }
+ case Intrinsic::amdgcn_fmed3: {
+ // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+ // for the shader.
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ Value *Src2 = II->getArgOperand(2);
+
+ bool Swap = false;
+ // Canonicalize constants to RHS operands.
+ //
+ // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+ std::swap(Src1, Src2);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (Swap) {
+ II->setArgOperand(0, Src0);
+ II->setArgOperand(1, Src1);
+ II->setArgOperand(2, Src2);
+ return II;
+ }
+
+ if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
+ APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+ C2->getValueAPF());
+ return replaceInstUsesWith(*II,
+ ConstantFP::get(Builder.getContext(), Result));
+ }
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (!CC)
+ break;
+
+ // Guard against invalid arguments.
+ int64_t CCVal = CC->getZExtValue();
+ bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
+ if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+ CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+ (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+ CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+ break;
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+ if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+ Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+ if (CCmp->isNullValue()) {
+ return replaceInstUsesWith(
+ *II, ConstantExpr::getSExt(CCmp, II->getType()));
+ }
+
+ // The result of V_ICMP/V_FCMP assembly instructions (which this
+ // intrinsic exposes) is one bit per thread, masked with the EXEC
+ // register (which contains the bitmask of live threads). So a
+ // comparison that always returns true is the same as a read of the
+ // EXEC register.
+ Value *NewF = Intrinsic::getDeclaration(
+ II->getModule(), Intrinsic::read_register, II->getType());
+ Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
+ MDNode *MD = MDNode::get(II->getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
+ CallInst *NewCall = Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ // Canonicalize constants to RHS.
+ CmpInst::Predicate SwapPred
+ = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ II->setArgOperand(2, ConstantInt::get(CC->getType(),
+ static_cast<int>(SwapPred)));
+ return II;
+ }
+
+ if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+ break;
+
+ // Canonicalize compare eq with true value to compare != 0
+ // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+ // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+ // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+ // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+ Value *ExtSrc;
+ if (CCVal == CmpInst::ICMP_EQ &&
+ ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
+ (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
+ ExtSrc->getType()->isIntegerTy(1)) {
+ II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
+ II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ return II;
+ }
+
+ CmpInst::Predicate SrcPred;
+ Value *SrcLHS;
+ Value *SrcRHS;
+
+ // Fold compare eq/ne with 0 from a compare result as the predicate to the
+ // intrinsic. The typical use is a wave vote function in the library, which
+ // will be fed from a user code condition compared with 0. Fold in the
+ // redundant compare.
+
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+ // -> llvm.amdgcn.[if]cmp(a, b, pred)
+ //
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+ // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+ if (match(Src1, m_Zero()) &&
+ match(Src0,
+ m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
+ if (CCVal == CmpInst::ICMP_EQ)
+ SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+ Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
+ Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
+
+ Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
+ SrcLHS->getType());
+ Value *Args[] = { SrcLHS, SrcRHS,
+ ConstantInt::get(CC->getType(), SrcPred) };
+ CallInst *NewCall = Builder.CreateCall(NewF, Args);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ break;
+ }
+ case Intrinsic::stackrestore: {
+ // If the save is right next to the restore, remove the restore. This can
+ // happen when variable allocas are DCE'd.
+ if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+ if (&*++SS->getIterator() == II)
+ return eraseInstFromFunction(CI);
+ }
+ }
+
+ // Scan down this block to see if there is another stack restore in the
+ // same block without an intervening call/alloca.
+ BasicBlock::iterator BI(II);
+ TerminatorInst *TI = II->getParent()->getTerminator();
+ bool CannotRemove = false;
+ for (++BI; &*BI != TI; ++BI) {
+ if (isa<AllocaInst>(BI)) {
+ CannotRemove = true;
+ break;
+ }
+ if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
+ // If there is a stackrestore below this one, remove this one.
+ if (II->getIntrinsicID() == Intrinsic::stackrestore)
+ return eraseInstFromFunction(CI);
+
+ // Bail if we cross over an intrinsic with side effects, such as
+ // llvm.stacksave, llvm.read_register, or llvm.setjmp.
+ if (II->mayHaveSideEffects()) {
+ CannotRemove = true;
+ break;
+ }
+ } else {
+ // If we found a non-intrinsic call, we can't remove the stack
+ // restore.
+ CannotRemove = true;
+ break;
+ }
+ }
+ }
+
+    // If the stack restore is in a return or resume block and there are no
+    // allocas or calls between the restore and the terminator, remove the
+    // restore.
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
+ return eraseInstFromFunction(CI);
+ break;
+ }
+ case Intrinsic::lifetime_start:
+    // Asan needs to poison memory to detect invalid accesses, which are
+    // possible even for an empty lifetime range.
+ if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
+ break;
+
+ if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
+ Intrinsic::lifetime_end, *this))
+ return nullptr;
+ break;
+ case Intrinsic::assume: {
+ Value *IIOperand = II->getArgOperand(0);
+ // Remove an assume if it is immediately followed by an identical assume.
+ if (match(II->getNextNode(),
+ m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ return eraseInstFromFunction(CI);
+
+ // Canonicalize assume(a && b) -> assume(a); assume(b);
+ // Note: New assumption intrinsics created here are registered by
+ // the InstCombineIRInserter object.
+ Value *AssumeIntrinsic = II->getCalledValue(), *A, *B;
+ if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
+ Builder.CreateCall(AssumeIntrinsic, A, II->getName());
+ Builder.CreateCall(AssumeIntrinsic, B, II->getName());
+ return eraseInstFromFunction(*II);
+ }
+ // assume(!(a || b)) -> assume(!a); assume(!b);
+ if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
+ Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(A), II->getName());
+ Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(B), II->getName());
+ return eraseInstFromFunction(*II);
+ }
+
+ // assume( (load addr) != null ) -> add 'nonnull' metadata to load
+ // (if assume is valid at the load)
+ CmpInst::Predicate Pred;
+ Instruction *LHS;
+ if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
+ Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ LHS->getType()->isPointerTy() &&
+ isValidAssumeForContext(II, LHS, &DT)) {
+ MDNode *MD = MDNode::get(II->getContext(), None);
+ LHS->setMetadata(LLVMContext::MD_nonnull, MD);
+ return eraseInstFromFunction(*II);
+
+ // TODO: apply nonnull return attributes to calls and invokes
+ // TODO: apply range metadata for range check patterns?
+ }
+
+ // If there is a dominating assume with the same condition as this one,
+ // then this one is redundant, and should be removed.
+ KnownBits Known(1);
+ computeKnownBits(IIOperand, Known, 0, II);
+ if (Known.isAllOnes())
+ return eraseInstFromFunction(*II);
+
+ // Update the cache of affected values for this assumption (we might be
+ // here because we just simplified the condition).
+ AC.updateAffectedValues(II);
+ break;
+ }
+ case Intrinsic::experimental_gc_relocate: {
+ // Translate facts known about a pointer before relocating into
+ // facts about the relocate value, while being careful to
+ // preserve relocation semantics.
+ Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr();
+
+    // Remove the relocation if unused. Note that this check is required to
+    // prevent the cases below from looping forever.
+ if (II->use_empty())
+ return eraseInstFromFunction(*II);
+
+ // Undef is undef, even after relocation.
+ // TODO: provide a hook for this in GCStrategy. This is clearly legal for
+ // most practical collectors, but there was discussion in the review thread
+ // about whether it was legal for all possible collectors.
+ if (isa<UndefValue>(DerivedPtr))
+ // Use undef of gc_relocate's type to replace it.
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ if (auto *PT = dyn_cast<PointerType>(II->getType())) {
+ // The relocation of null will be null for most any collector.
+ // TODO: provide a hook for this in GCStrategy. There might be some
+ // weird collector this property does not hold for.
+ if (isa<ConstantPointerNull>(DerivedPtr))
+ // Use null-pointer of gc_relocate's type to replace it.
+ return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
+
+ // isKnownNonNull -> nonnull attribute
+ if (isKnownNonNullAt(DerivedPtr, II, &DT))
+ II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ }
+
+ // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
+ // Canonicalize on the type from the uses to the defs
+
+ // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
+ break;
+ }
+
+ case Intrinsic::experimental_guard: {
+ // Is this guard followed by another guard?
+ Instruction *NextInst = II->getNextNode();
+ Value *NextCond = nullptr;
+ if (match(NextInst,
+ m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+ Value *CurrCond = II->getArgOperand(0);
+
+      // Remove a guard that is immediately preceded by an identical guard.
+ if (CurrCond == NextCond)
+ return eraseInstFromFunction(*NextInst);
+
+ // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
+ return eraseInstFromFunction(*NextInst);
+ }
+ break;
+ }
+ }
+ return visitCallSite(II);
+}
+
+// Fence instruction simplification
+Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+ // Remove identical consecutive fences.
+ if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ if (FI.isIdenticalTo(NFI))
+ return eraseInstFromFunction(FI);
+ return nullptr;
+}
+
+// InvokeInst simplification
+//
+Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+ return visitCallSite(&II);
+}
+
+/// If this cast does not affect the value passed through the varargs area, we
+/// can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallSite CS,
+ const DataLayout &DL,
+ const CastInst *const CI,
+ const int ix) {
+ if (!CI->isLosslessCast())
+ return false;
+
+ // If this is a GC intrinsic, avoid munging types. We need types for
+ // statepoint reconstruction in SelectionDAG.
+ // TODO: This is probably something which should be expanded to all
+ // intrinsics since the entire point of intrinsics is that
+ // they are understandable by the optimizer.
+ if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS))
+ return false;
+
+ // The size of ByVal or InAlloca arguments is derived from the type, so we
+ // can't change to a type with a different size. If the size were
+ // passed explicitly we could avoid this check.
+ if (!CS.isByValOrInAllocaArgument(ix))
+ return true;
+
+ Type* SrcTy =
+ cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+ Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+ if (!SrcTy->isSized() || !DstTy->isSized())
+ return false;
+ if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
+ return false;
+ return true;
+}
+
+Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
+ if (!CI->getCalledFunction()) return nullptr;
+
+ auto InstCombineRAUW = [this](Instruction *From, Value *With) {
+ replaceInstUsesWith(*From, With);
+ };
+ LibCallSimplifier Simplifier(DL, &TLI, InstCombineRAUW);
+ if (Value *With = Simplifier.optimizeCall(CI)) {
+ ++NumSimplified;
+ return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
+ }
+
+ return nullptr;
+}
+
+static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
+ // Strip off at most one level of pointer casts, looking for an alloca. This
+ // is good enough in practice and simpler than handling any number of casts.
+ Value *Underlying = TrampMem->stripPointerCasts();
+ if (Underlying != TrampMem &&
+ (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
+ return nullptr;
+ if (!isa<AllocaInst>(Underlying))
+ return nullptr;
+
+ IntrinsicInst *InitTrampoline = nullptr;
+ for (User *U : TrampMem->users()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ if (!II)
+ return nullptr;
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
+ if (InitTrampoline)
+ // More than one init_trampoline writes to this value. Give up.
+ return nullptr;
+ InitTrampoline = II;
+ continue;
+ }
+ if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
+ // Allow any number of calls to adjust.trampoline.
+ continue;
+ return nullptr;
+ }
+
+ // No call to init.trampoline found.
+ if (!InitTrampoline)
+ return nullptr;
+
+ // Check that the alloca is being used in the expected way.
+ if (InitTrampoline->getOperand(0) != TrampMem)
+ return nullptr;
+
+ return InitTrampoline;
+}
+
+static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+ Value *TrampMem) {
+  // Visit all the previous instructions in the basic block, and try to find an
+  // init.trampoline which has a direct path to the adjust.trampoline.
+ for (BasicBlock::iterator I = AdjustTramp->getIterator(),
+ E = AdjustTramp->getParent()->begin();
+ I != E;) {
+ Instruction *Inst = &*--I;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+ II->getOperand(0) == TrampMem)
+ return II;
+ if (Inst->mayWriteToMemory())
+ return nullptr;
+ }
+ return nullptr;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function. Otherwise return NULL.
+//
+static IntrinsicInst *findInitTrampoline(Value *Callee) {
+ Callee = Callee->stripPointerCasts();
+ IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+ if (!AdjustTramp ||
+ AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+ return nullptr;
+
+ Value *TrampMem = AdjustTramp->getOperand(0);
+
+ if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
+ return IT;
+ if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
+ return IT;
+ return nullptr;
+}
+
+/// Improvements for call and invoke instructions.
+Instruction *InstCombiner::visitCallSite(CallSite CS) {
+ if (isAllocLikeFn(CS.getInstruction(), &TLI))
+ return visitAllocSite(*CS.getInstruction());
+
+ bool Changed = false;
+
+ // Mark any parameters that are known to be non-null with the nonnull
+ // attribute. This is helpful for inlining calls to functions with null
+ // checks on their arguments.
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
+ for (Value *V : CS.args()) {
+ if (V->getType()->isPointerTy() &&
+ !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ isKnownNonNullAt(V, CS.getInstruction(), &DT))
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == CS.arg_size() && "sanity check");
+
+ if (!ArgNos.empty()) {
+ AttributeList AS = CS.getAttributes();
+ LLVMContext &Ctx = CS.getInstruction()->getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ CS.setAttributes(AS);
+ Changed = true;
+ }
+
+ // If the callee is a pointer to a function, attempt to move any casts to the
+ // arguments of the call/invoke.
+ Value *Callee = CS.getCalledValue();
+ if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
+ return nullptr;
+
+ if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+ // Remove the convergent attr on calls when the callee is not convergent.
+ if (CS.isConvergent() && !CalleeF->isConvergent() &&
+ !CalleeF->isIntrinsic()) {
+ DEBUG(dbgs() << "Removing convergent attr from instr "
+ << CS.getInstruction() << "\n");
+ CS.setNotConvergent();
+ return CS.getInstruction();
+ }
+
+ // If the call and callee calling conventions don't match, this call must
+ // be unreachable, as the call is undefined.
+ if (CalleeF->getCallingConv() != CS.getCallingConv() &&
+ // Only do this for calls to a function with a body. A prototype may
+ // not actually end up matching the implementation's calling conv for a
+ // variety of reasons (e.g. it may be written in assembly).
+ !CalleeF->isDeclaration()) {
+ Instruction *OldCall = CS.getInstruction();
+ new StoreInst(ConstantInt::getTrue(Callee->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
+ OldCall);
+      // If OldCall does not return void then replaceAllUsesWith undef.
+      // This allows ValueHandlers and custom metadata to adjust themselves.
+ if (!OldCall->getType()->isVoidTy())
+ replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+ if (isa<CallInst>(OldCall))
+ return eraseInstFromFunction(*OldCall);
+
+      // We cannot remove an invoke because that would change the CFG; just
+      // change the callee to a null pointer.
+ cast<InvokeInst>(OldCall)->setCalledFunction(
+ Constant::getNullValue(CalleeF->getType()));
+ return nullptr;
+ }
+ }
+
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+    // If CS does not return void then replaceAllUsesWith undef.
+    // This allows ValueHandlers and custom metadata to adjust themselves.
+ if (!CS.getInstruction()->getType()->isVoidTy())
+ replaceInstUsesWith(*CS.getInstruction(),
+ UndefValue::get(CS.getInstruction()->getType()));
+
+ if (isa<InvokeInst>(CS.getInstruction())) {
+ // Can't remove an invoke because we cannot change the CFG.
+ return nullptr;
+ }
+
+    // This instruction is not reachable; just remove it. We insert a store to
+    // undef so that we know that this code is not reachable, despite the fact
+    // that we can't modify the CFG here.
+ new StoreInst(ConstantInt::getTrue(Callee->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
+ CS.getInstruction());
+
+ return eraseInstFromFunction(*CS.getInstruction());
+ }
+
+ if (IntrinsicInst *II = findInitTrampoline(Callee))
+ return transformCallThroughTrampoline(CS, II);
+
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ if (FTy->isVarArg()) {
+ int ix = FTy->getNumParams();
+ // See if we can optimize any arguments passed through the varargs area of
+ // the call.
+ for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
+ E = CS.arg_end(); I != E; ++I, ++ix) {
+ CastInst *CI = dyn_cast<CastInst>(*I);
+ if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) {
+ *I = CI->getOperand(0);
+ Changed = true;
+ }
+ }
+ }
+
+ if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
+ // Inline asm calls cannot throw - mark them 'nounwind'.
+ CS.setDoesNotThrow();
+ Changed = true;
+ }
+
+  // Try to optimize the call if possible; we require DataLayout for most of
+  // this. None of these calls are seen as possibly dead, so go ahead and
+  // delete the instruction now.
+ if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+ Instruction *I = tryOptimizeCall(CI);
+    // If we changed something, return the result; otherwise, fall through to
+    // the final check below.
+ if (I) return eraseInstFromFunction(*I);
+ }
+
+ return Changed ? CS.getInstruction() : nullptr;
+}
+
+/// If the callee is a constexpr cast of a function, attempt to move the cast to
+/// the arguments of the call/invoke.
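+/// For example (assuming the pointee types are compatible), a call such as
+///   call i32 bitcast (i32 (i8*)* @f to i32 (i32*)*)(i32* %p)
+/// can become a direct call to @f with %p cast to i8*.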
+bool InstCombiner::transformConstExprCastCall(CallSite CS) {
+ auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+ if (!Callee)
+ return false;
+
+ // The prototype of a thunk is a lie. Don't directly call such a function.
+ if (Callee->hasFnAttribute("thunk"))
+ return false;
+
+ Instruction *Caller = CS.getInstruction();
+ const AttributeList &CallerPAL = CS.getAttributes();
+
+ // Okay, this is a cast from a function to a different type. Unless doing so
+ // would cause a type conversion of one of our arguments, change this call to
+ // be a direct call with arguments casted to the appropriate types.
+ //
+ FunctionType *FT = Callee->getFunctionType();
+ Type *OldRetTy = Caller->getType();
+ Type *NewRetTy = FT->getReturnType();
+
+ // Check to see if we are changing the return type...
+ if (OldRetTy != NewRetTy) {
+
+ if (NewRetTy->isStructTy())
+ return false; // TODO: Handle multiple return values.
+
+ if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
+ if (Callee->isDeclaration())
+ return false; // Cannot transform this return value.
+
+ if (!Caller->use_empty() &&
+ // void -> non-void is handled specially
+ !NewRetTy->isVoidTy())
+ return false; // Cannot transform this return value.
+ }
+
+ if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
+ return false; // Attribute not compatible with transformed value.
+ }
+
+ // If the callsite is an invoke instruction, and the return value is used by
+ // a PHI node in a successor, we cannot change the return type of the call
+ // because there is no place to put the cast instruction (without breaking
+ // the critical edge). Bail out in this case.
+ if (!Caller->use_empty())
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+ for (User *U : II->users())
+ if (PHINode *PN = dyn_cast<PHINode>(U))
+ if (PN->getParent() == II->getNormalDest() ||
+ PN->getParent() == II->getUnwindDest())
+ return false;
+ }
+
+ unsigned NumActualArgs = CS.arg_size();
+ unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+ // Prevent us turning:
+ // declare void @takes_i32_inalloca(i32* inalloca)
+ // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
+ //
+ // into:
+ // call void @takes_i32_inalloca(i32* null)
+ //
+ // Similarly, avoid folding away bitcasts of byval calls.
+ if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
+ return false;
+
+ CallSite::arg_iterator AI = CS.arg_begin();
+ for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+ Type *ActTy = (*AI)->getType();
+
+ if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
+ return false; // Cannot transform this parameter value.
+
+ if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+ return false; // Attribute not compatible with transformed value.
+
+ if (CS.isInAllocaArgument(i))
+ return false; // Cannot transform to and from inalloca.
+
+ // If the parameter is passed as a byval argument, then we have to have a
+ // sized type and the sized type has to have the same size as the old type.
+ if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
+ if (!ParamPTy || !ParamPTy->getElementType()->isSized())
+ return false;
+
+ Type *CurElTy = ActTy->getPointerElementType();
+ if (DL.getTypeAllocSize(CurElTy) !=
+ DL.getTypeAllocSize(ParamPTy->getElementType()))
+ return false;
+ }
+ }
+
+ if (Callee->isDeclaration()) {
+ // Do not delete arguments unless we have a function body.
+ if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
+ return false;
+
+ // If the callee is just a declaration, don't change the varargsness of the
+ // call. We don't want to introduce a varargs call where one doesn't
+ // already exist.
+ PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
+ if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+ return false;
+
+ // If both the callee and the cast type are varargs, we still have to make
+ // sure the number of fixed parameters are the same or we have the same
+ // ABI issues as if we introduce a varargs call.
+ if (FT->isVarArg() &&
+ cast<FunctionType>(APTy->getElementType())->isVarArg() &&
+ FT->getNumParams() !=
+ cast<FunctionType>(APTy->getElementType())->getNumParams())
+ return false;
+ }
+
+ if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+ !CallerPAL.isEmpty()) {
+ // In this case we have more arguments than the new function type, but we
+ // won't be dropping them. Check that these extra arguments have attributes
+ // that are compatible with being a vararg call argument.
+ unsigned SRetIdx;
+ if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
+ SRetIdx > FT->getNumParams())
+ return false;
+ }
+
+ // Okay, we decided that this is a safe thing to do: go ahead and start
+ // inserting cast instructions as necessary.
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ Args.reserve(NumActualArgs);
+ ArgAttrs.reserve(NumActualArgs);
+
+ // Get any return attributes.
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+
+ // If the return value is not being used, the type may not be compatible
+ // with the existing attributes. Wipe out any problematic attributes.
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
+
+ AI = CS.arg_begin();
+ for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+
+ Value *NewArg = *AI;
+ if ((*AI)->getType() != ParamTy)
+ NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+
+ // If the function takes more arguments than the call was taking, add them
+ // now.
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
+ Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+ ArgAttrs.push_back(AttributeSet());
+ }
+
+  // If the call supplies more arguments than the new function type accepts,
+  // handle the extra ones.
+ if (FT->getNumParams() < NumActualArgs) {
+ // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
+ if (FT->isVarArg()) {
+ // Add all of the arguments in their promoted form to the arg list.
+ for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+ Type *PTy = getPromotedType((*AI)->getType());
+ Value *NewArg = *AI;
+ if (PTy != (*AI)->getType()) {
+ // Must promote to pass through va_arg area!
+ Instruction::CastOps opcode =
+ CastInst::getCastOpcode(*AI, false, PTy, false);
+ NewArg = Builder.CreateCast(opcode, *AI, PTy);
+ }
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+ }
+ }
+
+ AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+
+ if (NewRetTy->isVoidTy())
+ Caller->setName(""); // Void type should not have a name.
+
+ assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
+ "missing argument attributes");
+ LLVMContext &Ctx = Callee->getContext();
+ AttributeList NewCallerPAL = AttributeList::get(
+ Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NewCS = Builder.CreateInvoke(Callee, II->getNormalDest(),
+ II->getUnwindDest(), Args, OpBundles);
+ } else {
+ NewCS = Builder.CreateCall(Callee, Args, OpBundles);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind());
+ }
+ NewCS->takeName(Caller);
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallerPAL);
+
+ // Preserve the weight metadata for the new call instruction. The metadata
+ // is used by SamplePGO to check callsite's hotness.
+ uint64_t W;
+ if (Caller->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+
+ // Insert a cast of the return type as necessary.
+ Instruction *NC = NewCS.getInstruction();
+ Value *NV = NC;
+ if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+ if (!NV->getType()->isVoidTy()) {
+ NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
+ NC->setDebugLoc(Caller->getDebugLoc());
+
+      // If this is an invoke instruction, we should insert it after the first
+      // non-phi instruction in the normal successor block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
+ InsertNewInstBefore(NC, *I);
+ } else {
+ // Otherwise, it's a call, just insert cast right after the call.
+ InsertNewInstBefore(NC, *Caller);
+ }
+ Worklist.AddUsersToWorkList(*Caller);
+ } else {
+ NV = UndefValue::get(Caller->getType());
+ }
+ }
+
+ if (!Caller->use_empty())
+ replaceInstUsesWith(*Caller, NV);
+ else if (Caller->hasValueHandle()) {
+ if (OldRetTy == NV->getType())
+ ValueHandleBase::ValueIsRAUWd(Caller, NV);
+ else
+      // We cannot call ValueIsRAUWd with a new value of a different type; the
+      // actual tracked value will simply disappear.
+ ValueHandleBase::ValueIsDeleted(Caller);
+ }
+
+ eraseInstFromFunction(*Caller);
+ return true;
+}
+
+/// Turn a call to a function created by init_trampoline / adjust_trampoline
+/// intrinsic pair into a direct call to the underlying function.
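+/// The chain value recorded by llvm.init.trampoline is spliced into the
+/// argument list at the position of the callee's 'nest' parameter; e.g. a call
+/// "call i32 %tramp(i32 %x)" through the trampoline for @f(i8* nest, i32)
+/// becomes "call i32 @f(i8* nest %chain, i32 %x)".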
+Instruction *
+InstCombiner::transformCallThroughTrampoline(CallSite CS,
+ IntrinsicInst *Tramp) {
+ Value *Callee = CS.getCalledValue();
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ AttributeList Attrs = CS.getAttributes();
+
+ // If the call already has the 'nest' attribute somewhere then give up -
+ // otherwise 'nest' would occur twice after splicing in the chain.
+ if (Attrs.hasAttrSomewhere(Attribute::Nest))
+ return nullptr;
+
+ assert(Tramp &&
+ "transformCallThroughTrampoline called with incorrect CallSite.");
+
+ Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
+ FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
+
+ AttributeList NestAttrs = NestF->getAttributes();
+ if (!NestAttrs.isEmpty()) {
+ unsigned NestArgNo = 0;
+ Type *NestTy = nullptr;
+ AttributeSet NestAttr;
+
+ // Look for a parameter marked with the 'nest' attribute.
+ for (FunctionType::param_iterator I = NestFTy->param_begin(),
+ E = NestFTy->param_end();
+ I != E; ++NestArgNo, ++I) {
+ AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ if (AS.hasAttribute(Attribute::Nest)) {
+ // Record the parameter type and any other attributes.
+ NestTy = *I;
+ NestAttr = AS;
+ break;
+ }
+ }
+
+ if (NestTy) {
+ Instruction *Caller = CS.getInstruction();
+ std::vector<Value*> NewArgs;
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgs.reserve(CS.arg_size() + 1);
+ NewArgAttrs.reserve(CS.arg_size());
+
+ // Insert the nest argument into the call argument list, which may
+ // mean appending it. Likewise for attributes.
+
+ {
+ unsigned ArgNo = 0;
+ CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ do {
+ if (ArgNo == NestArgNo) {
+ // Add the chain argument and attributes.
+ Value *NestVal = Tramp->getArgOperand(2);
+ if (NestVal->getType() != NestTy)
+ NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
+ NewArgs.push_back(NestVal);
+ NewArgAttrs.push_back(NestAttr);
+ }
+
+ if (I == E)
+ break;
+
+ // Add the original argument and attributes.
+ NewArgs.push_back(*I);
+ NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // The trampoline may have been bitcast to a bogus type (FTy).
+ // Handle this by synthesizing a new function type, equal to FTy
+ // with the chain parameter inserted.
+
+ std::vector<Type*> NewTypes;
+ NewTypes.reserve(FTy->getNumParams()+1);
+
+ // Insert the chain's type into the list of parameter types, which may
+ // mean appending it.
+ {
+ unsigned ArgNo = 0;
+ FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end();
+
+ do {
+ if (ArgNo == NestArgNo)
+ // Add the chain's type.
+ NewTypes.push_back(NestTy);
+
+ if (I == E)
+ break;
+
+ // Add the original type.
+ NewTypes.push_back(*I);
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // Replace the trampoline call with a direct call. Let the generic
+ // code sort out any function type mismatches.
+ FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
+ FTy->isVarArg());
+ Constant *NewCallee =
+ NestF->getType() == PointerType::getUnqual(NewFTy) ?
+ NestF : ConstantExpr::getBitCast(NestF,
+ PointerType::getUnqual(NewFTy));
+ AttributeList NewPAL =
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
+
+ Instruction *NewCaller;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NewCaller = InvokeInst::Create(NewCallee,
+ II->getNormalDest(), II->getUnwindDest(),
+ NewArgs, OpBundles);
+ cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+ cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+ } else {
+ NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles);
+ cast<CallInst>(NewCaller)->setTailCallKind(
+ cast<CallInst>(Caller)->getTailCallKind());
+ cast<CallInst>(NewCaller)->setCallingConv(
+ cast<CallInst>(Caller)->getCallingConv());
+ cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+ }
+
+ return NewCaller;
+ }
+ }
+
+ // Replace the trampoline call with a direct call. Since there is no 'nest'
+ // parameter, there is no need to adjust the argument list. Let the generic
+ // code sort out any function type mismatches.
+ Constant *NewCallee =
+ NestF->getType() == PTy ? NestF :
+ ConstantExpr::getBitCast(NestF, PTy);
+ CS.setCalledFunction(NewCallee);
+ return CS.getInstruction();
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
new file mode 100644
index 000000000000..dfdfd3e9da84
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -0,0 +1,2238 @@
+//===- InstCombineCasts.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for cast operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Analyze 'Val', seeing if it is a simple linear expression.
+/// If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
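+/// For example, Val = (X << 2) + 12 (with no-wrap flags on both operations)
+/// decomposes to X with Scale = 4 and Offset = 12.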
+static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+ uint64_t &Offset) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ Offset = CI->getZExtValue();
+ Scale = 0;
+ return ConstantInt::get(Val->getType(), 0);
+ }
+
+ if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+ // Cannot look past anything that might overflow.
+ OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
+ if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
+ Scale = 1;
+ Offset = 0;
+ return Val;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (I->getOpcode() == Instruction::Shl) {
+ // This is a value scaled by '1 << the shift amt'.
+ Scale = UINT64_C(1) << RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Mul) {
+ // This value is scaled by 'RHS'.
+ Scale = RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Add) {
+ // We have X+C. Check to see if we really have (X*C2)+C1,
+ // where C1 is divisible by C2.
+ unsigned SubScale;
+ Value *SubVal =
+ decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+ Offset += RHS->getZExtValue();
+ Scale = SubScale;
+ return SubVal;
+ }
+ }
+ }
+
+ // Otherwise, we can't look past this.
+ Scale = 1;
+ Offset = 0;
+ return Val;
+}
+
+/// If we find a cast of an allocation instruction, try to eliminate the cast by
+/// moving the type information into the alloc.
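+/// For example, a bitcast of "alloca i8, i32 1024" to i32* can be rewritten as
+/// an "alloca i32, i32 256", subject to the alignment and use checks below.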
+Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
+ AllocaInst &AI) {
+ PointerType *PTy = cast<PointerType>(CI.getType());
+
+ BuilderTy AllocaBuilder(Builder);
+ AllocaBuilder.SetInsertPoint(&AI);
+
+ // Get the type really allocated and the type casted to.
+ Type *AllocElTy = AI.getAllocatedType();
+ Type *CastElTy = PTy->getElementType();
+ if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
+
+ unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy);
+ unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy);
+ if (CastElTyAlign < AllocElTyAlign) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we are strictly
+ // increasing the alignment of the resultant allocation. If we keep it the
+ // same, we open the door to infinite loops of various kinds.
+ if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
+
+ uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
+ uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
+ if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we're not
+ // shrinking the amount of memory being allocated.
+ uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
+ uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
+ if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
+
+ // See if we can satisfy the modulus by pulling a scale out of the array
+ // size argument.
+ unsigned ArraySizeScale;
+ uint64_t ArrayOffset;
+ Value *NumElements = // See if the array size is a decomposable linear expr.
+ decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+  // If we can now satisfy the modulus by using a non-1 scale, we really can
+  // do the transform.
+ if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+ (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr;
+
+ unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+ Value *Amt = nullptr;
+ if (Scale == 1) {
+ Amt = NumElements;
+ } else {
+ Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
+ // Insert before the alloca, not before the cast.
+ Amt = AllocaBuilder.CreateMul(Amt, NumElements);
+ }
+
+ if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+ Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
+ Offset, true);
+ Amt = AllocaBuilder.CreateAdd(Amt, Off);
+ }
+
+ AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
+ New->setAlignment(AI.getAlignment());
+ New->takeName(&AI);
+ New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
+
+ // If the allocation has multiple real uses, insert a cast and change all
+ // things that used it to use the new cast. This will also hack on CI, but it
+ // will die soon.
+ if (!AI.hasOneUse()) {
+ // New is the allocation instruction, pointer typed. AI is the original
+ // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+ Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
+ replaceInstUsesWith(AI, NewCast);
+ }
+ return replaceInstUsesWith(CI, New);
+}
+
+/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
+/// true for, actually insert the code to evaluate the expression.
+Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
+ bool isSigned) {
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
+ C = FoldedC;
+ return C;
+ }
+
+ // Otherwise, it must be an instruction.
+ Instruction *I = cast<Instruction>(V);
+ Instruction *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+ Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty)
+ return I->getOperand(0);
+
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+ break;
+ case Instruction::Select: {
+ Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+ Res = SelectInst::Create(I->getOperand(0), True, False);
+ break;
+ }
+ case Instruction::PHI: {
+ PHINode *OPN = cast<PHINode>(I);
+ PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
+ for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+ Value *V =
+ EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+ NPN->addIncoming(V, OPN->getIncomingBlock(i));
+ }
+ Res = NPN;
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ llvm_unreachable("Unreachable!");
+ }
+
+ Res->takeName(I);
+ return InsertNewInstWith(Res, *I);
+}
+
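+/// Given a pair of casts "CI2(CI1(x))", return the opcode of the single cast
+/// that computes the same result (e.g. a zext i16 -> i64 followed by a trunc
+/// i64 -> i32 folds to a single zext i16 -> i32), or zero if the pair cannot
+/// be reduced to one cast.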
+Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2) {
+ Type *SrcTy = CI1->getSrcTy();
+ Type *MidTy = CI1->getDestTy();
+ Type *DstTy = CI2->getDestTy();
+
+ Instruction::CastOps firstOp = Instruction::CastOps(CI1->getOpcode());
+ Instruction::CastOps secondOp = Instruction::CastOps(CI2->getOpcode());
+ Type *SrcIntPtrTy =
+ SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
+ Type *MidIntPtrTy =
+ MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
+ Type *DstIntPtrTy =
+ DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
+ unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+ DstTy, SrcIntPtrTy, MidIntPtrTy,
+ DstIntPtrTy);
+
+ // We don't want to form an inttoptr or ptrtoint that converts to an integer
+ // type that differs from the pointer size.
+ if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
+ (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
+ Res = 0;
+
+ return Instruction::CastOps(Res);
+}
+
+/// @brief Implement the transforms common to all CastInst visitors.
+Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+ Value *Src = CI.getOperand(0);
+
+ // Try to eliminate a cast of a cast.
+ if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
+ if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
+ // The first cast (CSrc) is eliminable so we need to fix up or replace
+ // the second cast (CI). CSrc will then have a good chance of being dead.
+ return CastInst::Create(NewOpc, CSrc->getOperand(0), CI.getType());
+ }
+ }
+
+ // If we are casting a select, then fold the cast into the select.
+ if (auto *SI = dyn_cast<SelectInst>(Src))
+ if (Instruction *NV = FoldOpIntoSelect(CI, SI))
+ return NV;
+
+ // If we are casting a PHI, then fold the cast into the PHI.
+ if (auto *PN = dyn_cast<PHINode>(Src)) {
+ // Don't do this if it would create a PHI node with an illegal type from a
+ // legal type.
+ if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
+ shouldChangeType(CI.getType(), Src->getType()))
+ if (Instruction *NV = foldOpIntoPhi(CI, PN))
+ return NV;
+ }
+
+ return nullptr;
+}
+
+/// Return true if we can evaluate the specified expression tree as type Ty
+/// instead of its larger type, and arrive with the same value.
+/// This is used by code that tries to eliminate truncates.
+///
+/// Ty will always be a type smaller than V. We should return true if trunc(V)
+/// can be computed by computing V in the smaller type. If V is an instruction,
+/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
+/// makes sense if x and y can be efficiently truncated.
+///
+/// This function works on both vectors and scalars.
+///
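+/// For example, "trunc i64 (add i64 (zext i32 %a to i64), 7) to i32" can be
+/// computed directly as "add i32 %a, 7", so this returns true for that add.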
+static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
+ Instruction *CxtI) {
+ // We can always evaluate constants in another type.
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ Type *OrigTy = V->getType();
+
+ // If this is an extension from the dest type, we can eliminate it, even if it
+ // has multiple uses.
+ if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ I->getOperand(0)->getType() == Ty)
+ return true;
+
+ // We can't extend or shrink something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // These operators can all arbitrarily be extended or truncated.
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ // UDiv and URem can be truncated if all the truncated bits are zero.
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ if (BitWidth < OrigBitWidth) {
+ APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
+ if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+ IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ }
+ }
+ break;
+ }
+ case Instruction::Shl:
+ // If we are truncating the result of this SHL, and if it's a shift of a
+ // constant amount, we can always perform a SHL in a smaller type.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ if (CI->getLimitedValue(BitWidth) < BitWidth)
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
+ }
+ break;
+ case Instruction::LShr:
+ // If this is a truncate of a logical shr, we can truncate it to a smaller
+ // lshr iff we know that the bits we would otherwise be shifting in are
+ // already zeros.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ if (IC.MaskedValueIsZero(I->getOperand(0),
+ APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) &&
+ CI->getLimitedValue(BitWidth) < BitWidth) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
+ }
+ }
+ break;
+ case Instruction::Trunc:
+ // trunc(trunc(x)) -> trunc(x)
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
+ return true;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
+ canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
+/// Given a vector that is bitcast to an integer, optionally logically
+/// right-shifted, and truncated, convert it to an extractelement.
+/// Example (big endian):
+/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
+/// --->
+/// extractelement <4 x i32> %X, 1
+static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
+ Value *TruncOp = Trunc.getOperand(0);
+ Type *DestType = Trunc.getType();
+ if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
+ return nullptr;
+
+ Value *VecInput = nullptr;
+ ConstantInt *ShiftVal = nullptr;
+ if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
+ m_LShr(m_BitCast(m_Value(VecInput)),
+ m_ConstantInt(ShiftVal)))) ||
+ !isa<VectorType>(VecInput->getType()))
+ return nullptr;
+
+ VectorType *VecType = cast<VectorType>(VecInput->getType());
+ unsigned VecWidth = VecType->getPrimitiveSizeInBits();
+ unsigned DestWidth = DestType->getPrimitiveSizeInBits();
+ unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
+
+ if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
+ return nullptr;
+
+ // If the element type of the vector doesn't match the result type,
+ // bitcast it to a vector type that we can extract from.
+ unsigned NumVecElts = VecWidth / DestWidth;
+ if (VecType->getElementType() != DestType) {
+ VecType = VectorType::get(DestType, NumVecElts);
+ VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
+ }
+
+ unsigned Elt = ShiftAmount / DestWidth;
+ if (IC.getDataLayout().isBigEndian())
+ Elt = NumVecElts - 1 - Elt;
+
+ return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
+}
+
+/// Try to narrow the width of bitwise logic instructions with constants.
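+/// For example, "trunc i32 (and i32 %x, 65280) to i16" becomes
+/// "and i16 (trunc i32 %x to i16), -256" when i16 is a desirable type here.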
+Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
+ Type *SrcTy = Trunc.getSrcTy();
+ Type *DestTy = Trunc.getType();
+ if (isa<IntegerType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
+ return nullptr;
+
+ BinaryOperator *LogicOp;
+ Constant *C;
+ if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(LogicOp))) ||
+ !LogicOp->isBitwiseLogicOp() ||
+ !match(LogicOp->getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ // trunc (logic X, C) --> logic (trunc X, C')
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *NarrowOp0 = Builder.CreateTrunc(LogicOp->getOperand(0), DestTy);
+ return BinaryOperator::Create(LogicOp->getOpcode(), NarrowOp0, NarrowC);
+}
+
+/// Try to narrow the width of a splat shuffle. This could be generalized to any
+/// shuffle with a constant operand, but we limit the transform to avoid
+/// creating a shuffle type that targets may not be able to lower effectively.
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+ if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
+ Shuf->getMask()->getSplatValue() &&
+ Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+ Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getMask());
+ }
+
+ return nullptr;
+}
+
+/// Try to narrow the width of an insert element. This could be generalized for
+/// any vector constant, but we limit the transform to insertion into undef to
+/// avoid potential backend problems from unsupported insertion widths. This
+/// could also be extended to handle the case of inserting a scalar constant
+/// into a vector variable.
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::CastOps Opcode = Trunc.getOpcode();
+ assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+ "Unexpected instruction for shrinking");
+
+ auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+ if (!InsElt || !InsElt->hasOneUse())
+ return nullptr;
+
+ Type *DestTy = Trunc.getType();
+ Type *DestScalarTy = DestTy->getScalarType();
+ Value *VecOp = InsElt->getOperand(0);
+ Value *ScalarOp = InsElt->getOperand(1);
+ Value *Index = InsElt->getOperand(2);
+
+ if (isa<UndefValue>(VecOp)) {
+ // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+ // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+ UndefValue *NarrowUndef = UndefValue::get(DestTy);
+ Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+ return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+
+ // Test if the trunc is the user of a select which is part of a
+ // minimum or maximum operation. If so, don't do any more simplification.
+ // Even simplifying demanded bits can break the canonical form of a
+ // min/max.
+ Value *LHS, *RHS;
+ if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
+ if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ // See if we can simplify any instructions used by the input whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(CI))
+ return &CI;
+
+ Value *Src = CI.getOperand(0);
+ Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+
+ // Attempt to truncate the entire input expression tree to the destination
+  // type. Only do this if the dest type is a simple type; don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+ canEvaluateTruncated(Src, DestTy, *this, &CI)) {
+
+    // If this cast is a truncate, evaluating in a different type always
+ // eliminates the cast, so it is always a win.
+ DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid cast: " << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+ return replaceInstUsesWith(CI, Res);
+ }
+
+ // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
+ if (DestTy->getScalarSizeInBits() == 1) {
+ Constant *One = ConstantInt::get(SrcTy, 1);
+ Src = Builder.CreateAnd(Src, One);
+ Value *Zero = Constant::getNullValue(Src->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+ }
+
+ // FIXME: Maybe combine the next two transforms to handle the no cast case
+ // more efficiently. Support vector types. Cleanup code by using m_OneUse.
+
+ // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
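+  // For example, "trunc i64 (lshr i64 (zext i32 %a to i64), 8) to i16" becomes
+  // "trunc i32 (lshr i32 %a, 8) to i16", doing the shift in %a's type.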
+ Value *A = nullptr; ConstantInt *Cst = nullptr;
+ if (Src->hasOneUse() &&
+ match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ // We have three types to worry about here, the type of A, the source of
+ // the truncate (MidSize), and the destination of the truncate. We know that
+ // ASize < MidSize and MidSize > ResultSize, but don't know the relation
+ // between ASize and ResultSize.
+ unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+
+ // If the shift amount is larger than the size of A, then the result is
+ // known to be zero because all the input bits got shifted out.
+ if (Cst->getZExtValue() >= ASize)
+ return replaceInstUsesWith(CI, Constant::getNullValue(DestTy));
+
+ // Since we're doing an lshr and a zero extend, and know that the shift
+ // amount is smaller than ASize, it is always safe to do the shift in A's
+ // type, then zero extend or truncate to the result.
+ Value *Shift = Builder.CreateLShr(A, Cst->getZExtValue());
+ Shift->takeName(Src);
+ return CastInst::CreateIntegerCast(Shift, DestTy, false);
+ }
+
+ // FIXME: We should canonicalize to zext/trunc and remove this transform.
+ // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type
+ // conversion.
+ // It works because bits coming from sign extension have the same value as
+ // the sign bit of the original value; performing ashr instead of lshr
+ // generates bits of the same value as the sign bit.
+ if (Src->hasOneUse() &&
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
+ const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ const unsigned CISize = CI.getType()->getPrimitiveSizeInBits();
+ const unsigned MaxAmt = SExtSize - std::max(CISize, ASize);
+ unsigned ShiftAmt = Cst->getZExtValue();
+
+    // This optimization can only be performed when zero bits generated by
+ // the original lshr aren't pulled into the value after truncation, so we
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (ShiftAmt <= MaxAmt) {
+ if (CISize == ASize)
+ return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(),
+ std::min(ShiftAmt, ASize - 1)));
+ if (SExt->hasOneUse()) {
+ Value *Shift = Builder.CreateAShr(A, std::min(ShiftAmt, ASize - 1));
+ Shift->takeName(Src);
+ return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ }
+ }
+ }
+
+ if (Instruction *I = shrinkBitwiseLogic(CI))
+ return I;
+
+ if (Instruction *I = shrinkSplatShuffle(CI, Builder))
+ return I;
+
+ if (Instruction *I = shrinkInsertElt(CI, Builder))
+ return I;
+
+ if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
+ shouldChangeType(SrcTy, DestTy)) {
+ // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
+ // dest type is native and cst < dest size.
+ if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
+ !match(A, m_Shr(m_Value(), m_Constant()))) {
+      // Skip when A is itself a right shift by a constant: narrowing a shift of
+      // a shift would undo a combine in FoldShiftByConstant, and the shl/shr
+      // pair is the extend-in-register pattern.
+ const unsigned DestSize = DestTy->getScalarSizeInBits();
+ if (Cst->getValue().ult(DestSize)) {
+ Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
+
+ return BinaryOperator::Create(
+ Instruction::Shl, NewTrunc,
+ ConstantInt::get(DestTy, Cst->getValue().trunc(DestSize)));
+ }
+ }
+ }
+
+ if (Instruction *I = foldVecTruncToExtElt(CI, *this))
+ return I;
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform) {
+ // If we are just checking for a icmp eq of a single bit and zext'ing it
+ // to an integer, then shift the bit to the appropriate place and then
+ // cast to integer to avoid the comparison.
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
+ const APInt &Op1CV = Op1C->getValue();
+
+ // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
+ // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
+ if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV.isNullValue()) ||
+ (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
+ if (!DoTransform) return ICI;
+
+ Value *In = ICI->getOperand(0);
+ Value *Sh = ConstantInt::get(In->getType(),
+ In->getType()->getScalarSizeInBits() - 1);
+ In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/);
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One, In->getName() + ".not");
+ }
+
+ return replaceInstUsesWith(CI, In);
+ }
+
+ // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ // zext (X == 1) to i32 --> X iff X has only the low bit set.
+ // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 0) to i32 --> X iff X has only the low bit set.
+ // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ if ((Op1CV.isNullValue() || Op1CV.isPowerOf2()) &&
+ // This only works for EQ and NE
+ ICI->isEquality()) {
+      // If Op1C is some other power of two, convert:
+ KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+ if (!DoTransform) return ICI;
+
+ bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
+ if (!Op1CV.isNullValue() && (Op1CV != KnownZeroMask)) {
+ // (X&4) == 2 --> false
+ // (X&4) != 2 --> true
+ Constant *Res = ConstantInt::get(Type::getInt1Ty(CI.getContext()),
+ isNE);
+ Res = ConstantExpr::getZExt(Res, CI.getType());
+ return replaceInstUsesWith(CI, Res);
+ }
+
+ uint32_t ShAmt = KnownZeroMask.logBase2();
+ Value *In = ICI->getOperand(0);
+ if (ShAmt) {
+ // Perform a logical shr by shiftamt.
+ // Insert the shift to put the result in the low bit.
+ In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
+ In->getName() + ".lobit");
+ }
+
+ if (!Op1CV.isNullValue() == isNE) { // Toggle the low bit.
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One);
+ }
+
+ if (CI.getType() == In->getType())
+ return replaceInstUsesWith(CI, In);
+
+ Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false);
+ return replaceInstUsesWith(CI, IntCast);
+ }
+ }
+ }
+
+ // icmp ne A, B is equal to xor A, B when A and B only really have one bit.
+ // It is also profitable to transform icmp eq into not(xor(A, B)) because that
+ // may lead to additional simplifications.
+ if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI);
+ KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI);
+
+ if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
+ APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
+ APInt UnknownBit = ~KnownBits;
+ if (UnknownBit.countPopulation() == 1) {
+ if (!DoTransform) return ICI;
+
+ Value *Result = Builder.CreateXor(LHS, RHS);
+
+ // Mask off any bits that are set and won't be shifted away.
+ if (KnownLHS.One.uge(UnknownBit))
+ Result = Builder.CreateAnd(Result,
+ ConstantInt::get(ITy, UnknownBit));
+
+ // Shift the bit we're testing down to the lsb.
+ Result = Builder.CreateLShr(
+ Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
+ Result->takeName(ICI);
+ return replaceInstUsesWith(CI, Result);
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Determine if the specified value can be computed in the specified wider type
+/// and produce the same low bits. If not, return false.
+///
+/// If this function returns true, it can also return a non-zero number of bits
+/// (in BitsToClear) which indicates that the value it computes is correct for
+/// the zero extend, but that the additional BitsToClear bits need to be zero'd
+/// out. For example, to promote something like:
+///
+/// %B = trunc i64 %A to i32
+/// %C = lshr i32 %B, 8
+/// %E = zext i32 %C to i64
+///
+/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
+/// set to 8 to indicate that the promoted value needs to have bits 24-31
+/// cleared in addition to bits 32-63. Since an 'and' will be generated to
+/// clear the top bits anyway, doing this has no extra cost.
+///
+/// This function works on both vectors and scalars.
+static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
+ InstCombiner &IC, Instruction *CxtI) {
+ BitsToClear = 0;
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // If the input is a truncate from the destination type, we can trivially
+ // eliminate it.
+ if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ return true;
+
+ // We can't extend or shrink something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ unsigned Opc = I->getOpcode(), Tmp;
+ switch (Opc) {
+ case Instruction::ZExt: // zext(zext(x)) -> zext(x).
+ case Instruction::SExt: // zext(sext(x)) -> sext(x).
+ case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
+ return false;
+ // These can all be promoted if neither operand has 'bits to clear'.
+ if (BitsToClear == 0 && Tmp == 0)
+ return true;
+
+ // If the operation is an AND/OR/XOR and the bits to clear are zero in the
+ // other side, BitsToClear is ok.
+ if (Tmp == 0 && I->isBitwiseLogicOp()) {
+ // We use MaskedValueIsZero here for generality, but the case we care
+ // about the most is constant RHS.
+ unsigned VSize = V->getType()->getScalarSizeInBits();
+ if (IC.MaskedValueIsZero(I->getOperand(1),
+ APInt::getHighBitsSet(VSize, BitsToClear),
+ 0, CxtI))
+ return true;
+ }
+
+ // Otherwise, we don't know how to analyze this BitsToClear case yet.
+ return false;
+
+ case Instruction::Shl:
+ // We can promote shl(x, cst) if we can promote x. Since shl overwrites the
+ // upper bits we can reduce BitsToClear by the shift amount.
+ if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ uint64_t ShiftAmt = Amt->getZExtValue();
+ BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
+ return true;
+ }
+ return false;
+ case Instruction::LShr:
+  // We can promote lshr(x, cst) if we can promote x. The shift moves bits that
+  // should be zero down into the value, so the ultimate 'and' must clear them;
+  // account for this by growing BitsToClear by the shift amount.
+ if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ BitsToClear += Amt->getZExtValue();
+ if (BitsToClear > V->getType()->getScalarSizeInBits())
+ BitsToClear = V->getType()->getScalarSizeInBits();
+ return true;
+ }
+ // Cannot promote variable LSHR.
+ return false;
+ case Instruction::Select:
+ if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear are
+ // known zero in the disagreeing side.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear
+ // are known zero in the disagreeing input.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ return false;
+ }
+}
+
+Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+ // If this zero extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this zext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+  // If one of the common conversions will work, do it.
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // Attempt to extend the entire input expression tree to the destination
+ // type. Only do this if the dest type is a simple type, don't convert the
+ // expression tree to something weird like i93 unless the source is also
+ // strange.
+ unsigned BitsToClear;
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+ canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
+ assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+ "Can't clear more bits than in SrcTy");
+
+ // Okay, we can transform this! Insert the new expression now.
+ DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid zero extend: " << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+
+ uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with zeros, just replace this
+ // cast with the result.
+ if (MaskedValueIsZero(Res,
+ APInt::getHighBitsSet(DestBitSize,
+ DestBitSize-SrcBitsKept),
+ 0, &CI))
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit an AND to clear the high bits.
+ Constant *C = ConstantInt::get(Res->getType(),
+ APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
+ return BinaryOperator::CreateAnd(Res, C);
+ }
+
+ // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+ // types and if the sizes are just right we can convert this into a logical
+ // 'and' which will be much cheaper than the pair of casts.
+ if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
+ // TODO: Subsume this into EvaluateInDifferentType.
+
+ // Get the sizes of the types involved. We know that the intermediate type
+ // will be smaller than A or C, but don't know the relation between A and C.
+ Value *A = CSrc->getOperand(0);
+ unsigned SrcSize = A->getType()->getScalarSizeInBits();
+ unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
+ unsigned DstSize = CI.getType()->getScalarSizeInBits();
+ // If we're actually extending zero bits, then if
+ // SrcSize < DstSize: zext(a & mask)
+ // SrcSize == DstSize: a & mask
+ // SrcSize > DstSize: trunc(a) & mask
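+    //
+    // For example (an illustrative sketch of the SrcSize < DstSize case):
+    //   %m = trunc i32 %a to i8
+    //   %z = zext i8 %m to i64
+    // becomes
+    //   %and = and i32 %a, 255
+    //   %z   = zext i32 %and to i64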
+ if (SrcSize < DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
+ Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
+ return new ZExtInst(And, CI.getType());
+ }
+
+ if (SrcSize == DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
+ AndValue));
+ }
+ if (SrcSize > DstSize) {
+ Value *Trunc = Builder.CreateTrunc(A, CI.getType());
+ APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+ return BinaryOperator::CreateAnd(Trunc,
+ ConstantInt::get(Trunc->getType(),
+ AndValue));
+ }
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+ return transformZExtICmp(ICI, CI);
+
+ BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+ if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
+ // of the (zext icmp) can be eliminated. If so, immediately perform the
+    // corresponding elimination.
+ ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+ ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+ if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+ (transformZExtICmp(LHS, CI, false) ||
+ transformZExtICmp(RHS, CI, false))) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
+ Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
+ Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
+ BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast);
+
+ // Perform the elimination.
+ if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
+ transformZExtICmp(LHS, *LZExt);
+ if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
+ transformZExtICmp(RHS, *RZExt);
+
+ return Or;
+ }
+ }
+
+ // zext(trunc(X) & C) -> (X & zext(C)).
+ Constant *C;
+ Value *X;
+ if (SrcI &&
+ match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
+ X->getType() == CI.getType())
+ return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
+
+ // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
+ Value *And;
+ if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
+ match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
+ X->getType() == CI.getType()) {
+ Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
+ }
+
+ return nullptr;
+}
+
+/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
+Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
+ Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ // Don't bother if Op1 isn't of vector or integer type.
+ if (!Op1->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
+ // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
+ if ((Pred == ICmpInst::ICMP_SLT && Op1C->isNullValue()) ||
+ (Pred == ICmpInst::ICMP_SGT && Op1C->isAllOnesValue())) {
+
+ Value *Sh = ConstantInt::get(Op0->getType(),
+ Op0->getType()->getScalarSizeInBits()-1);
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
+
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = Builder.CreateNot(In, In->getName() + ".not");
+ return replaceInstUsesWith(CI, In);
+ }
+ }
+
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ // If we know that only one bit of the LHS of the icmp can be set and we
+ // have an equality comparison with zero or a power of 2, we can transform
+ // the icmp and sext into bitwise/integer operations.
+ if (ICI->hasOneUse() &&
+ ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
+ KnownBits Known = computeKnownBits(Op0, 0, &CI);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) {
+ Value *In = ICI->getOperand(0);
+
+ // If the icmp tests for a known zero bit we can constant fold it.
+ if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
+ Value *V = Pred == ICmpInst::ICMP_NE ?
+ ConstantInt::getAllOnesValue(CI.getType()) :
+ ConstantInt::getNullValue(CI.getType());
+ return replaceInstUsesWith(CI, V);
+ }
+
+ if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
+ // sext ((x & 2^n) == 0) -> (x >> n) - 1
+ // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
+ unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
+ // Perform a right shift to place the desired bit in the LSB.
+ if (ShiftAmt)
+ In = Builder.CreateLShr(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // At this point "In" is either 1 or 0. Subtract 1 to turn
+ // {1, 0} -> {0, -1}.
+ In = Builder.CreateAdd(In,
+ ConstantInt::getAllOnesValue(In->getType()),
+ "sext");
+ } else {
+ // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
+ // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
+ unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
+ // Perform a left shift to place the desired bit in the MSB.
+ if (ShiftAmt)
+ In = Builder.CreateShl(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // Distribute the bit over the whole bit width.
+ In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
+ KnownZeroMask.getBitWidth() - 1), "sext");
+ }
+
+ if (CI.getType() == In->getType())
+ return replaceInstUsesWith(CI, In);
+ return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the common
+/// low bits. This is used by code that tries to promote integer operations to
+/// a wider type when doing so will allow us to eliminate the extension.
+///
+/// This function works on both vectors and scalars.
+///
+static bool canEvaluateSExtd(Value *V, Type *Ty) {
+ assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
+ "Can't sign extend type to a smaller type");
+ // If this is a constant, it can be trivially promoted.
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // If this is a truncate from the dest type, we can trivially eliminate it.
+ if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+ return true;
+
+ // We can't extend or shrink something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::SExt: // sext(sext(x)) -> sext(x)
+ case Instruction::ZExt: // sext(zext(x)) -> zext(x)
+ case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // These operators can all arbitrarily be extended if their inputs can.
+ return canEvaluateSExtd(I->getOperand(0), Ty) &&
+ canEvaluateSExtd(I->getOperand(1), Ty);
+
+ //case Instruction::Shl: TODO
+ //case Instruction::LShr: TODO
+
+ case Instruction::Select:
+ return canEvaluateSExtd(I->getOperand(1), Ty) &&
+ canEvaluateSExtd(I->getOperand(2), Ty);
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateSExtd(IncValue, Ty)) return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
+Instruction *InstCombiner::visitSExt(SExtInst &CI) {
+ // If this sign extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this sext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // If we know that the value being extended is positive, we can use a zext
+ // instead.
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
+ Value *ZExt = Builder.CreateZExt(Src, DestTy);
+ return replaceInstUsesWith(CI, ZExt);
+ }
+
+ // Attempt to extend the entire input expression tree to the destination
+ // type. Only do this if the dest type is a simple type, don't convert the
+ // expression tree to something weird like i93 unless the source is also
+ // strange.
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+ canEvaluateSExtd(Src, DestTy)) {
+ // Okay, we can transform this! Insert the new expression now.
+ DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid sign extend: " << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, true);
+ assert(Res->getType() == DestTy);
+
+ uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with sign bit, just replace this
+ // cast with the result.
+ if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit a shl + ashr to do the sign extend.
+ Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
+ ShAmt);
+ }
+
+ // If the input is a trunc from the destination type, then turn sext(trunc(x))
+ // into shifts.
+ Value *X;
+ if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+ // sext(trunc(X)) --> ashr(shl(X, C), C)
+ unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DestBitSize = DestTy->getScalarSizeInBits();
+ Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+ return transformSExtICmp(ICI, CI);
+
+  // If the input is a shl/ashr pair of the same constant, then this is a sign
+ // extension from a smaller value. If we could trust arbitrary bitwidth
+  // integers, we could turn this into a truncate to the smaller width and then
+ // use a sext for the whole extension. Since we don't, look deeper and check
+ // for a truncate. If the source and dest are the same type, eliminate the
+ // trunc and extend and just do shifts. For example, turn:
+ // %a = trunc i32 %i to i8
+ // %b = shl i8 %a, 6
+ // %c = ashr i8 %b, 6
+ // %d = sext i8 %c to i32
+ // into:
+ // %a = shl i32 %i, 30
+ // %d = ashr i32 %a, 30
+ Value *A = nullptr;
+ // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
+ ConstantInt *BA = nullptr, *CA = nullptr;
+ if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
+ m_ConstantInt(CA))) &&
+ BA == CA && A->getType() == CI.getType()) {
+ unsigned MidSize = Src->getType()->getScalarSizeInBits();
+ unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
+ unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+ Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+ A = Builder.CreateShl(A, ShAmtV, CI.getName());
+ return BinaryOperator::CreateAShr(A, ShAmtV);
+ }
+
+ return nullptr;
+}
+
+
+/// Return a Constant* for the specified floating-point constant if it fits
+/// in the specified FP type without changing its value.
+static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+ bool losesInfo;
+ APFloat F = CFP->getValueAPF();
+ (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+ if (!losesInfo)
+ return ConstantFP::get(CFP->getContext(), F);
+ return nullptr;
+}
+
+/// Look through floating-point extensions until we get the source value.
+static Value *lookThroughFPExtensions(Value *V) {
+ while (auto *FPExt = dyn_cast<FPExtInst>(V))
+ V = FPExt->getOperand(0);
+
+ // If this value is a constant, return the constant in the smallest FP type
+ // that can accurately represent it. This allows us to turn
+ // (float)((double)X+2.0) into x+2.0f.
+ if (auto *CFP = dyn_cast<ConstantFP>(V)) {
+ if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
+ return V; // No constant folding of this.
+ // See if the value can be truncated to half and then reextended.
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf()))
+ return V;
+ // See if the value can be truncated to float and then reextended.
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle()))
+ return V;
+ if (CFP->getType()->isDoubleTy())
+ return V; // Won't shrink.
+ if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble()))
+ return V;
+ // Don't try to shrink to various long double types.
+ }
+
+ return V;
+}
+
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+ // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
+ // simplify this expression to avoid one or more of the trunc/extend
+ // operations if we can do so without changing the numerical results.
+ //
+ // The exact manner in which the widths of the operands interact to limit
+ // what we can and cannot do safely varies from operation to operation, and
+ // is explained below in the various case statements.
+ BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+ if (OpI && OpI->hasOneUse()) {
+ Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0));
+ Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1));
+ unsigned OpWidth = OpI->getType()->getFPMantissaWidth();
+ unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth();
+ unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth();
+ unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
+ unsigned DstWidth = CI.getType()->getFPMantissaWidth();
+ switch (OpI->getOpcode()) {
+ default: break;
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ // For addition and subtraction, the infinitely precise result can
+ // essentially be arbitrarily wide; proving that double rounding
+ // will not occur because the result of OpI is exact (as we will for
+ // FMul, for example) is hopeless. However, we *can* nonetheless
+ // frequently know that double rounding cannot occur (or that it is
+ // innocuous) by taking advantage of the specific structure of
+ // infinitely-precise results that admit double rounding.
+ //
+      // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient
+ // to represent both sources, we can guarantee that the double
+ // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis,
+ // "A Rigorous Framework for Fully Supporting the IEEE Standard ..."
+ // for proof of this fact).
+ //
+ // Note: Figueroa does not consider the case where DstFormat !=
+ // SrcFormat. It's possible (likely even!) that this analysis
+ // could be tightened for those cases, but they are rare (the main
+ // case of interest here is (float)((double)float + float)).
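+      //
+      // For example (an illustrative sketch using IEEE single and double):
+      //   %x = fpext float %a to double
+      //   %y = fpext float %b to double
+      //   %s = fadd double %x, %y
+      //   %t = fptrunc double %s to float
+      // can be rewritten as
+      //   %t = fadd float %a, %b
+      // because OpWidth = 53 >= 2*24+1 and DstWidth = 24 >= SrcWidth = 24.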
+ if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
+ if (LHSOrig->getType() != CI.getType())
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
+ if (RHSOrig->getType() != CI.getType())
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
+ Instruction *RI =
+ BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig);
+ RI->copyFastMathFlags(OpI);
+ return RI;
+ }
+ break;
+ case Instruction::FMul:
+ // For multiplication, the infinitely precise result has at most
+ // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
+ // that such a value can be exactly represented, then no double
+ // rounding can possibly occur; we can safely perform the operation
+ // in the destination format if it can represent both sources.
+ if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
+ if (LHSOrig->getType() != CI.getType())
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
+ if (RHSOrig->getType() != CI.getType())
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
+ Instruction *RI =
+ BinaryOperator::CreateFMul(LHSOrig, RHSOrig);
+ RI->copyFastMathFlags(OpI);
+ return RI;
+ }
+ break;
+ case Instruction::FDiv:
+      // For division, we again use the bound from Figueroa's
+ // dissertation. I am entirely certain that this bound can be
+ // tightened in the unbalanced operand case by an analysis based on
+ // the diophantine rational approximation bound, but the well-known
+ // condition used here is a good conservative first pass.
+ // TODO: Tighten bound via rigorous analysis of the unbalanced case.
+ if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
+ if (LHSOrig->getType() != CI.getType())
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
+ if (RHSOrig->getType() != CI.getType())
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
+ Instruction *RI =
+ BinaryOperator::CreateFDiv(LHSOrig, RHSOrig);
+ RI->copyFastMathFlags(OpI);
+ return RI;
+ }
+ break;
+ case Instruction::FRem:
+      // Remainder is straightforward: it is always exact, so the type of OpI
+      // doesn't enter into things at all. We simply evaluate
+ // in whichever source type is larger, then convert to the
+ // destination type.
+ if (SrcWidth == OpWidth)
+ break;
+ if (LHSWidth < SrcWidth)
+ LHSOrig = Builder.CreateFPExt(LHSOrig, RHSOrig->getType());
+ else if (RHSWidth <= SrcWidth)
+ RHSOrig = Builder.CreateFPExt(RHSOrig, LHSOrig->getType());
+ if (LHSOrig != OpI->getOperand(0) || RHSOrig != OpI->getOperand(1)) {
+ Value *ExactResult = Builder.CreateFRem(LHSOrig, RHSOrig);
+ if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
+ RI->copyFastMathFlags(OpI);
+ return CastInst::CreateFPCast(ExactResult, CI.getType());
+ }
+ }
+
+ // (fptrunc (fneg x)) -> (fneg (fptrunc x))
+ if (BinaryOperator::isFNeg(OpI)) {
+ Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1),
+ CI.getType());
+ Instruction *RI = BinaryOperator::CreateFNeg(InnerTrunc);
+ RI->copyFastMathFlags(OpI);
+ return RI;
+ }
+ }
+
+ // (fptrunc (select cond, R1, Cst)) -->
+ // (select cond, (fptrunc R1), (fptrunc Cst))
+ //
+ // - but only if this isn't part of a min/max operation, else we'll
+ // ruin min/max canonical form which is to have the select and
+ // compare's operands be of the same type with no casts to look through.
+ Value *LHS, *RHS;
+ SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
+ if (SI &&
+ (isa<ConstantFP>(SI->getOperand(1)) ||
+ isa<ConstantFP>(SI->getOperand(2))) &&
+ matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) {
+ Value *LHSTrunc = Builder.CreateFPTrunc(SI->getOperand(1), CI.getType());
+ Value *RHSTrunc = Builder.CreateFPTrunc(SI->getOperand(2), CI.getType());
+ return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::fabs:
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::trunc: {
+ Value *Src = II->getArgOperand(0);
+ if (!Src->hasOneUse())
+ break;
+
+ // Except for fabs, this transformation requires the input of the unary FP
+ // operation to be itself an fpext from the type to which we're
+ // truncating.
+ if (II->getIntrinsicID() != Intrinsic::fabs) {
+ FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+ if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+ break;
+ }
+
+ // Do unary FP operation on smaller type.
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, CI.getType());
+ Type *IntrinsicType[] = { CI.getType() };
+ Function *Overload = Intrinsic::getDeclaration(
+ CI.getModule(), II->getIntrinsicID(), IntrinsicType);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+
+ Value *Args[] = { InnerTrunc };
+ CallInst *NewCI = CallInst::Create(Overload, Args,
+ OpBundles, II->getName());
+ NewCI->copyFastMathFlags(II);
+ return NewCI;
+ }
+ }
+ }
+
+ if (Instruction *I = shrinkInsertElt(CI, Builder))
+ return I;
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitFPExt(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
+// This is safe if the intermediate type has enough bits in its mantissa to
+// accurately represent all values of X. For example, this won't work with
+// i64 -> float -> i64.
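+//
+// For example (an illustrative sketch; the widths here are arbitrary):
+//   %f = uitofp i16 %x to float
+//   %i = fptoui float %f to i32
+// can become
+//   %i = zext i16 %x to i32
+// because ActualSize = min(16, 32) = 16 fits in float's 24-bit mantissa.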
+Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) {
+ if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
+ return nullptr;
+ Instruction *OpI = cast<Instruction>(FI.getOperand(0));
+
+ Value *SrcI = OpI->getOperand(0);
+ Type *FITy = FI.getType();
+ Type *OpITy = OpI->getType();
+ Type *SrcTy = SrcI->getType();
+ bool IsInputSigned = isa<SIToFPInst>(OpI);
+ bool IsOutputSigned = isa<FPToSIInst>(FI);
+
+ // We can safely assume the conversion won't overflow the output range,
+ // because (for example) (uint8_t)18293.f is undefined behavior.
+
+ // Since we can assume the conversion won't overflow, our decision as to
+ // whether the input will fit in the float should depend on the minimum
+ // of the input range and output range.
+
+ // This means this is also safe for a signed input and unsigned output, since
+ // a negative input would lead to undefined behavior.
+ int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned;
+ int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned;
+ int ActualSize = std::min(InputSize, OutputSize);
+
+ if (ActualSize <= OpITy->getFPMantissaWidth()) {
+ if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) {
+ if (IsInputSigned && IsOutputSigned)
+ return new SExtInst(SrcI, FITy);
+ return new ZExtInst(SrcI, FITy);
+ }
+ if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits())
+ return new TruncInst(SrcI, FITy);
+ if (SrcTy == FITy)
+ return replaceInstUsesWith(FI, SrcI);
+ return new BitCastInst(SrcI, FITy);
+ }
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+ Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+ if (!OpI)
+ return commonCastTransforms(FI);
+
+ if (Instruction *I = FoldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+ Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+ if (!OpI)
+ return commonCastTransforms(FI);
+
+ if (Instruction *I = FoldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
+ // If the source integer type is not the intptr_t type for this target, do a
+ // trunc or zext to the intptr_t type, then inttoptr of it. This allows the
+ // cast to be exposed to other transforms.
+ unsigned AS = CI.getAddressSpace();
+ if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ DL.getPointerSizeInBits(AS)) {
+ Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
+ if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+ Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+ Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
+ }
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ return nullptr;
+}
+
+/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
+ Value *Src = CI.getOperand(0);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+ // If casting the result of a getelementptr instruction with no offset, turn
+ // this into a cast of the original pointer!
+ if (GEP->hasAllZeroIndices() &&
+        // If CI is an addrspacecast and GEP changes the pointer type, merging
+ // GEP into CI would undo canonicalizing addrspacecast with different
+ // pointer types, causing infinite loops.
+ (!isa<AddrSpaceCastInst>(CI) ||
+ GEP->getType() == GEP->getPointerOperandType())) {
+ // Changing the cast operand is usually not a good idea but it is safe
+ // here because the pointer operand is being replaced with another
+ // pointer operand so the opcode doesn't need to change.
+ Worklist.Add(GEP);
+ CI.setOperand(0, GEP->getOperand(0));
+ return &CI;
+ }
+ }
+
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
+ // If the destination integer type is not the intptr_t type for this target,
+ // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
+ // to be exposed to other transforms.
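+  //
+  // For example, on a target with 64-bit pointers (an illustrative sketch):
+  //   %i = ptrtoint i8* %p to i32
+  // becomes
+  //   %w = ptrtoint i8* %p to i64
+  //   %i = trunc i64 %w to i32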
+
+ Type *Ty = CI.getType();
+ unsigned AS = CI.getPointerAddressSpace();
+
+ if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS))
+ return commonPointerCastTransforms(CI);
+
+ Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
+ if (Ty->isVectorTy()) // Handle vectors of pointers.
+ PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
+
+ Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy);
+ return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
+}
+
+/// This input value (which is known to have vector type) is being zero extended
+/// or truncated to the specified vector type.
+/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
+///
+/// The source and destination vector types may have different element types.
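+///
+/// For example (an illustrative sketch, assuming little-endian layout):
+///   %i = bitcast <2 x i32> %v to i64
+///   %z = zext i64 %i to i128
+///   %c = bitcast i128 %z to <4 x i32>
+/// can be replaced by
+///   %c = shufflevector <2 x i32> %v, <2 x i32> zeroinitializer,
+///                      <4 x i32> <i32 0, i32 1, i32 2, i32 2>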
+static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy,
+ InstCombiner &IC) {
+ // We can only do this optimization if the output is a multiple of the input
+ // element size, or the input is a multiple of the output element size.
+ // Convert the input type to have the same element type as the output.
+ VectorType *SrcTy = cast<VectorType>(InVal->getType());
+
+ if (SrcTy->getElementType() != DestTy->getElementType()) {
+ // The input types don't need to be identical, but for now they must be the
+ // same size. There is no specific reason we couldn't handle things like
+ // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
+ // there yet.
+ if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
+ DestTy->getElementType()->getPrimitiveSizeInBits())
+ return nullptr;
+
+ SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+ InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
+ }
+
+ // Now that the element types match, get the shuffle mask and RHS of the
+ // shuffle to use, which depends on whether we're increasing or decreasing the
+ // size of the input.
+ SmallVector<uint32_t, 16> ShuffleMask;
+ Value *V2;
+
+ if (SrcTy->getNumElements() > DestTy->getNumElements()) {
+ // If we're shrinking the number of elements, just shuffle in the low
+ // elements from the input and use undef as the second shuffle input.
+ V2 = UndefValue::get(SrcTy);
+ for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
+ ShuffleMask.push_back(i);
+
+ } else {
+ // If we're increasing the number of elements, shuffle in all of the
+ // elements from InVal and fill the rest of the result elements with zeros
+ // from a constant zero.
+ V2 = Constant::getNullValue(SrcTy);
+ unsigned SrcElts = SrcTy->getNumElements();
+ for (unsigned i = 0, e = SrcElts; i != e; ++i)
+ ShuffleMask.push_back(i);
+
+ // The excess elements reference the first element of the zero input.
+ for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i)
+ ShuffleMask.push_back(SrcElts);
+ }
+
+ return new ShuffleVectorInst(InVal, V2,
+ ConstantDataVector::get(V2->getContext(),
+ ShuffleMask));
+}
+
+static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
+ return Value % Ty->getPrimitiveSizeInBits() == 0;
+}
+
+static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
+ return Value / Ty->getPrimitiveSizeInBits();
+}
+
+/// V is a value which is inserted into a vector of VecEltTy.
+/// Look through the value to see if we can decompose it into
+/// insertions into the vector. See the example in the comment for
+/// OptimizeIntegerToVectorInsertions for the pattern this handles.
+/// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
+///
+/// This returns false if the pattern can't be matched or true if it can,
+/// filling in Elements with the elements found here.
+static bool collectInsertionElements(Value *V, unsigned Shift,
+ SmallVectorImpl<Value *> &Elements,
+ Type *VecEltTy, bool isBigEndian) {
+ assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+ "Shift should be a multiple of the element type size");
+
+ // Undef values never contribute useful bits to the result.
+ if (isa<UndefValue>(V)) return true;
+
+  // If we got down to a value of the right type, we win; try inserting into the
+ // right element.
+ if (V->getType() == VecEltTy) {
+ // Inserting null doesn't actually insert any elements.
+ if (Constant *C = dyn_cast<Constant>(V))
+ if (C->isNullValue())
+ return true;
+
+ unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+ if (isBigEndian)
+ ElementIndex = Elements.size() - ElementIndex - 1;
+
+ // Fail if multiple elements are inserted into this slot.
+ if (Elements[ElementIndex])
+ return false;
+
+ Elements[ElementIndex] = V;
+ return true;
+ }
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ // Figure out the # elements this provides, and bitcast it or slice it up
+ // as required.
+ unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
+ VecEltTy);
+ // If the constant is the size of a vector element, we just need to bitcast
+ // it to the right type so it gets properly inserted.
+ if (NumElts == 1)
+ return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
+ Shift, Elements, VecEltTy, isBigEndian);
+
+ // Okay, this is a constant that covers multiple elements. Slice it up into
+ // pieces and insert each element-sized piece into the vector.
+ if (!isa<IntegerType>(C->getType()))
+ C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
+ C->getType()->getPrimitiveSizeInBits()));
+ unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
+ Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ unsigned ShiftI = Shift+i*ElementSize;
+ Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
+ ShiftI));
+ Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
+ if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
+ isBigEndian))
+ return false;
+ }
+ return true;
+ }
+
+ if (!V->hasOneUse()) return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+ switch (I->getOpcode()) {
+ default: return false; // Unhandled case.
+ case Instruction::BitCast:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::ZExt:
+ if (!isMultipleOfTypeSize(
+ I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
+ VecEltTy))
+ return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Or:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian) &&
+ collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Shl: {
+ // Must be shifting by a constant that is a multiple of the element size.
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CI) return false;
+ Shift += CI->getZExtValue();
+ if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ }
+
+ }
+}
+
+
+/// If the input is an 'or' instruction, we may be doing shifts and ors to
+/// assemble the elements of the vector manually.
+/// Try to rip the code out and replace it with insertelements. This is to
+/// optimize code like this:
+///
+/// %tmp37 = bitcast float %inc to i32
+/// %tmp38 = zext i32 %tmp37 to i64
+/// %tmp31 = bitcast float %inc5 to i32
+/// %tmp32 = zext i32 %tmp31 to i64
+/// %tmp33 = shl i64 %tmp32, 32
+/// %ins35 = or i64 %tmp33, %tmp38
+/// %tmp43 = bitcast i64 %ins35 to <2 x float>
+///
+/// Into two insertelements that do "buildvector{%inc, %inc5}".
+static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
+ InstCombiner &IC) {
+ VectorType *DestVecTy = cast<VectorType>(CI.getType());
+ Value *IntInput = CI.getOperand(0);
+
+ SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+ if (!collectInsertionElements(IntInput, 0, Elements,
+ DestVecTy->getElementType(),
+ IC.getDataLayout().isBigEndian()))
+ return nullptr;
+
+  // If we succeeded, we know that all of the elements are specified by Elements
+ // or are zero if Elements has a null entry. Recast this as a set of
+ // insertions.
+ Value *Result = Constant::getNullValue(CI.getType());
+ for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+ if (!Elements[i]) continue; // Unset element.
+
+ Result = IC.Builder.CreateInsertElement(Result, Elements[i],
+ IC.Builder.getInt32(i));
+ }
+
+ return Result;
+}
+
+/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
+/// vector followed by extract element. The backend tends to handle bitcasts of
+/// vectors better than bitcasts of scalars because vector registers are
+/// usually not type-specific like scalar integer or scalar floating-point.
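+///
+/// For example (an illustrative sketch):
+///   %e = extractelement <2 x i64> %v, i32 0
+///   %b = bitcast i64 %e to double
+/// becomes
+///   %bc = bitcast <2 x i64> %v to <2 x double>
+///   %b  = extractelement <2 x double> %bc, i32 0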
+static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
+ InstCombiner &IC) {
+ // TODO: Create and use a pattern matcher for ExtractElementInst.
+ auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
+ if (!ExtElt || !ExtElt->hasOneUse())
+ return nullptr;
+
+ // The bitcast must be to a vectorizable type, otherwise we can't make a new
+ // type to extract from.
+ Type *DestType = BitCast.getType();
+ if (!VectorType::isValidElementType(DestType))
+ return nullptr;
+
+ unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements();
+ auto *NewVecType = VectorType::get(DestType, NumElts);
+ auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
+ NewVecType, "bc");
+ return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
+}
+
+/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
+static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Type *DestTy = BitCast.getType();
+ BinaryOperator *BO;
+ if (!DestTy->isIntOrIntVectorTy() ||
+ !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
+ !BO->isBitwiseLogicOp())
+ return nullptr;
+
+ // FIXME: This transform is restricted to vector types to avoid backend
+ // problems caused by creating potentially illegal operations. If a fix-up is
+ // added to handle that situation, we can remove this check.
+ if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
+ return nullptr;
+
+ Value *X;
+ if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
+ }
+
+ if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
+ }
+
+ // Canonicalize vector bitcasts to come before vector bitwise logic with a
+ // constant. This eases recognition of special constants for later ops.
+ // Example:
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ Constant *C;
+ if (match(BO->getOperand(1), m_Constant(C))) {
+ // bitcast (logic X, C) --> logic (bitcast X, C')
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ Value *CastedC = ConstantExpr::getBitCast(C, DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
+ }
+
+ return nullptr;
+}
+
+/// Change the type of a select if we can eliminate a bitcast.
+static Instruction *foldBitCastSelect(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *TVal, *FVal;
+ if (!match(BitCast.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return nullptr;
+
+ // A vector select must maintain the same number of elements in its operands.
+ Type *CondTy = Cond->getType();
+ Type *DestTy = BitCast.getType();
+ if (CondTy->isVectorTy()) {
+ if (!DestTy->isVectorTy())
+ return nullptr;
+ if (DestTy->getVectorNumElements() != CondTy->getVectorNumElements())
+ return nullptr;
+ }
+
+ // FIXME: This transform is restricted from changing the select between
+ // scalars and vectors to avoid backend problems caused by creating
+ // potentially illegal operations. If a fix-up is added to handle that
+ // situation, we can remove this check.
+ if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
+ return nullptr;
+
+ auto *Sel = cast<Instruction>(BitCast.getOperand(0));
+ Value *X;
+ if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
+ Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
+ return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
+ }
+
+ if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
+ Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
+ return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
+ }
+
+ return nullptr;
+}
+
+/// Check if all users of CI are StoreInsts.
+static bool hasStoreUsersOnly(CastInst &CI) {
+ for (User *U : CI.users()) {
+ if (!isa<StoreInst>(U))
+ return false;
+ }
+ return true;
+}
+
+/// This function handles the following case:
+///
+/// A -> B cast
+/// PHI
+/// B -> A cast
+///
+/// All the related PHI nodes can be replaced by new PHI nodes with type A.
+/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
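+///
+/// For example (an illustrative sketch):
+///   %b = bitcast double %a to i64           ; A -> B
+///   %p = phi i64 [ %b, %bb1 ], [ %c, %bb2 ]
+///   %d = bitcast i64 %p to double           ; B -> A
+/// can be rewritten to use a phi of type double, removing both bitcasts.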
+Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
+ // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
+ if (hasStoreUsersOnly(CI))
+ return nullptr;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(); // Type B
+ Type *DestTy = CI.getType(); // Type A
+
+ SmallVector<PHINode *, 4> PhiWorklist;
+ SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+ // Find all of the A->B casts and PHI nodes.
+  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+  // OldPhiNodes is used to track all known PHI nodes; before a new PHI is
+  // added to PhiWorklist, it is first checked against and added to OldPhiNodes.
+ PhiWorklist.push_back(PN);
+ OldPhiNodes.insert(PN);
+ while (!PhiWorklist.empty()) {
+ auto *OldPN = PhiWorklist.pop_back_val();
+ for (Value *IncValue : OldPN->incoming_values()) {
+ if (isa<Constant>(IncValue))
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
+        // If there is a sequence of one or more load instructions, where each
+        // loaded value is used as the address of a later load, a bitcast is
+        // necessary to change the value type, so don't optimize that case. For
+        // simplicity we give up if the load address comes from another load.
+ Value *Addr = LI->getOperand(0);
+ if (Addr == &CI || isa<LoadInst>(Addr))
+ return nullptr;
+ if (LI->hasOneUse() && LI->isSimple())
+ continue;
+        // If a LoadInst has more than one use, changing the type of the loaded
+        // value may create another bitcast.
+ return nullptr;
+ }
+
+ if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+ if (OldPhiNodes.insert(PNode))
+ PhiWorklist.push_back(PNode);
+ continue;
+ }
+
+ auto *BCI = dyn_cast<BitCastInst>(IncValue);
+ // We can't handle other instructions.
+ if (!BCI)
+ return nullptr;
+
+ // Verify it's a A->B cast.
+ Type *TyA = BCI->getOperand(0)->getType();
+ Type *TyB = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ }
+ }
+
+  // For each old PHI node, create a corresponding new PHI node with type A.
+ SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+ for (auto *OldPN : OldPhiNodes) {
+ Builder.SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
+ NewPNodes[OldPN] = NewPN;
+ }
+
+ // Fill in the operands of new PHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+ Value *V = OldPN->getOperand(j);
+ Value *NewV = nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ NewV = ConstantExpr::getBitCast(C, DestTy);
+ } else if (auto *LI = dyn_cast<LoadInst>(V)) {
+ Builder.SetInsertPoint(LI->getNextNode());
+ NewV = Builder.CreateBitCast(LI, DestTy);
+ Worklist.Add(LI);
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ NewV = BCI->getOperand(0);
+ } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
+ NewV = NewPNodes[PrevPN];
+ }
+ assert(NewV);
+ NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+ }
+ }
+
+ // If there is a store with type B, change it to type A.
+ for (User *U : PN->users()) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ if (SI && SI->isSimple() && SI->getOperand(0) == PN) {
+ Builder.SetInsertPoint(SI);
+ auto *NewBC =
+ cast<BitCastInst>(Builder.CreateBitCast(NewPNodes[PN], SrcTy));
+ SI->setOperand(0, NewBC);
+ Worklist.Add(SI);
+ assert(hasStoreUsersOnly(*NewBC));
+ }
+ }
+
+ return replaceInstUsesWith(CI, NewPNodes[PN]);
+}
+
+Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
+ // If the operands are integer typed then apply the integer transforms,
+ // otherwise just apply the common ones.
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *DestTy = CI.getType();
+
+ // Get rid of casts from one type to the same type. These are useless and can
+ // be replaced by the operand.
+ if (DestTy == Src->getType())
+ return replaceInstUsesWith(CI, Src);
+
+ if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+ PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ Type *DstElTy = DstPTy->getElementType();
+ Type *SrcElTy = SrcPTy->getElementType();
+
+    // If we are casting an alloca to a pointer to a type of the same
+ // size, rewrite the allocation instruction to allocate the "right" type.
+ // There is no need to modify malloc calls because it is their bitcast that
+ // needs to be cleaned up.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
+ if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+ return V;
+
+ // When the type pointed to is not sized, the cast cannot be
+ // turned into a gep.
+ Type *PointeeType =
+ cast<PointerType>(Src->getType()->getScalarType())->getElementType();
+ if (!PointeeType->isSized())
+ return nullptr;
+
+ // If the source and destination are pointers, and this cast is equivalent
+ // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
+ // This can enhance SROA and other transforms that want type-safe pointers.
+ unsigned NumZeros = 0;
+ while (SrcElTy != DstElTy &&
+ isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
+ SrcElTy->getNumContainedTypes() /* not "{}" */) {
+ SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(0U);
+ ++NumZeros;
+ }
+
+ // If we found a path from the src to dest, create the getelementptr now.
+ if (SrcElTy == DstElTy) {
+ SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
+ return GetElementPtrInst::CreateInBounds(Src, Idxs);
+ }
+ }
+
+ if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+ if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
+ Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
+ return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
+ }
+
+ if (isa<IntegerType>(SrcTy)) {
+ // If this is a cast from an integer to vector, check to see if the input
+ // is a trunc or zext of a bitcast from vector. If so, we can replace all
+ // the casts with a shuffle and (potentially) a bitcast.
+ if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
+ CastInst *SrcCast = cast<CastInst>(Src);
+ if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
+ if (isa<VectorType>(BCIn->getOperand(0)->getType()))
+ if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0),
+ cast<VectorType>(DestTy), *this))
+ return I;
+ }
+
+ // If the input is an 'or' instruction, we may be doing shifts and ors to
+ // assemble the elements of the vector manually. Try to rip the code out
+ // and replace it with insertelements.
+ if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
+ return replaceInstUsesWith(CI, V);
+ }
+ }
+
+ if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+ if (SrcVTy->getNumElements() == 1) {
+ // If our destination is not a vector, then make this a straight
+ // scalar-scalar cast.
+ if (!DestTy->isVectorTy()) {
+ Value *Elem =
+ Builder.CreateExtractElement(Src,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+ }
+
+ // Otherwise, see if our source is an insert. If so, then use the scalar
+ // component directly.
+ if (InsertElementInst *IEI =
+ dyn_cast<InsertElementInst>(CI.getOperand(0)))
+ return CastInst::Create(Instruction::BitCast, IEI->getOperand(1),
+ DestTy);
+ }
+ }
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
+ // Okay, we have (bitcast (shuffle ..)). Check to see if this is
+ // a bitcast to a vector with the same # elts.
+ if (SVI->hasOneUse() && DestTy->isVectorTy() &&
+ DestTy->getVectorNumElements() == SVI->getType()->getNumElements() &&
+ SVI->getType()->getNumElements() ==
+ SVI->getOperand(0)->getType()->getVectorNumElements()) {
+ BitCastInst *Tmp;
+ // If either of the operands is a cast from CI.getType(), then
+ // evaluating the shuffle in the casted destination's type will allow
+ // us to eliminate at least one cast.
+ if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) &&
+ Tmp->getOperand(0)->getType() == DestTy) ||
+ ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
+ Tmp->getOperand(0)->getType() == DestTy)) {
+ Value *LHS = Builder.CreateBitCast(SVI->getOperand(0), DestTy);
+ Value *RHS = Builder.CreateBitCast(SVI->getOperand(1), DestTy);
+ // Return a new shuffle vector. Use the same element ID's, as we
+ // know the vector types match #elts.
+ return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
+ }
+ }
+ }
+
+ // Handle the A->B->A cast when there is an intervening PHI node.
+ if (PHINode *PN = dyn_cast<PHINode>(Src))
+ if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
+ return I;
+
+ if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
+ return I;
+
+ if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
+ return I;
+
+ if (Instruction *I = foldBitCastSelect(CI, Builder))
+ return I;
+
+ if (SrcTy->isPointerTy())
+ return commonPointerCastTransforms(CI);
+ return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+ // If the destination pointer element type is not the same as the source's,
+ // first do a bitcast to the destination type and then the addrspacecast.
+ // This allows the cast to be exposed to other transforms.
+ Value *Src = CI.getOperand(0);
+ PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
+ PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
+
+ Type *DestElemTy = DestTy->getElementType();
+ if (SrcTy->getElementType() != DestElemTy) {
+ Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
+ if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) {
+ // Handle vectors of pointers.
+ MidTy = VectorType::get(MidTy, VT->getNumElements());
+ }
+
+ Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
+ return new AddrSpaceCastInst(NewBitCast, CI.getType());
+ }
+
+ return commonPointerCastTransforms(CI);
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
new file mode 100644
index 000000000000..60d1cde971dd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -0,0 +1,5099 @@
+//===- InstCombineCompares.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitICmp and visitFCmp functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+// How many times is a select replaced by one of its operands?
+STATISTIC(NumSel, "Number of select opts");
+
+
+static ConstantInt *extractElement(Constant *V, Constant *Idx) {
+ return cast<ConstantInt>(ConstantExpr::getExtractElement(V, Idx));
+}
+
+static bool hasAddOverflow(ConstantInt *Result,
+ ConstantInt *In1, ConstantInt *In2,
+ bool IsSigned) {
+ if (!IsSigned)
+ return Result->getValue().ult(In1->getValue());
+
+ if (In2->isNegative())
+ return Result->getValue().sgt(In1->getValue());
+ return Result->getValue().slt(In1->getValue());
+}
+
+/// Compute Result = In1+In2, returning true if the result overflowed for this
+/// type.
+static bool addWithOverflow(Constant *&Result, Constant *In1,
+ Constant *In2, bool IsSigned = false) {
+ Result = ConstantExpr::getAdd(In1, In2);
+
+ if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+ Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+ if (hasAddOverflow(extractElement(Result, Idx),
+ extractElement(In1, Idx),
+ extractElement(In2, Idx),
+ IsSigned))
+ return true;
+ }
+ return false;
+ }
+
+ return hasAddOverflow(cast<ConstantInt>(Result),
+ cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+ IsSigned);
+}
+
+static bool hasSubOverflow(ConstantInt *Result,
+ ConstantInt *In1, ConstantInt *In2,
+ bool IsSigned) {
+ if (!IsSigned)
+ return Result->getValue().ugt(In1->getValue());
+
+ if (In2->isNegative())
+ return Result->getValue().slt(In1->getValue());
+
+ return Result->getValue().sgt(In1->getValue());
+}
+
+/// Compute Result = In1-In2, returning true if the result overflowed for this
+/// type.
+static bool subWithOverflow(Constant *&Result, Constant *In1,
+ Constant *In2, bool IsSigned = false) {
+ Result = ConstantExpr::getSub(In1, In2);
+
+ if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+ Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+ if (hasSubOverflow(extractElement(Result, Idx),
+ extractElement(In1, Idx),
+ extractElement(In2, Idx),
+ IsSigned))
+ return true;
+ }
+ return false;
+ }
+
+ return hasSubOverflow(cast<ConstantInt>(Result),
+ cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+ IsSigned);
+}
+
+/// Given an icmp instruction, return true if any use of this comparison is a
+/// branch. Used to avoid canonicalizing a sign bit comparison that feeds a
+/// branch.
+static bool hasBranchUse(ICmpInst &I) {
+ for (auto *U : I.users())
+ if (isa<BranchInst>(U))
+ return true;
+ return false;
+}
+
+/// Given an exploded icmp instruction, return true if the comparison only
+/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the
+/// result of the comparison is true when the input value is signed.
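+/// For example, with an i8 input, (slt X, 0), (sle X, -1), (ugt X, 127) and
+/// (uge X, 128) all test only the sign bit with TrueIfSigned == true, while
+/// (sgt X, -1) tests it with TrueIfSigned == false.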
+static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
+ bool &TrueIfSigned) {
+ switch (Pred) {
+ case ICmpInst::ICMP_SLT: // True if LHS s< 0
+ TrueIfSigned = true;
+ return RHS.isNullValue();
+ case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1
+ TrueIfSigned = true;
+ return RHS.isAllOnesValue();
+ case ICmpInst::ICMP_SGT: // True if LHS s> -1
+ TrueIfSigned = false;
+ return RHS.isAllOnesValue();
+ case ICmpInst::ICMP_UGT:
+ // True if LHS u> RHS and RHS == high-bit-mask - 1
+ TrueIfSigned = true;
+ return RHS.isMaxSignedValue();
+ case ICmpInst::ICMP_UGE:
+ // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
+ TrueIfSigned = true;
+ return RHS.isSignMask();
+ default:
+ return false;
+ }
+}
+
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+/// TODO: Refactor with decomposeBitTestICmp()?
+static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
+ if (!ICmpInst::isSigned(Pred))
+ return false;
+
+ if (C.isNullValue())
+ return ICmpInst::isRelational(Pred);
+
+ if (C.isOneValue()) {
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SLE;
+ return true;
+ }
+ } else if (C.isAllOnesValue()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Pred = ICmpInst::ICMP_SGE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Given a signed integer type and a set of known zero and one bits, compute
+/// the maximum and minimum values that could have the specified known zero and
+/// known one bits, returning them in Min/Max.
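+/// For example, for a 4-bit value with Known.Zero = 0b0100 and
+/// Known.One = 0b0001, the unknown bits are 0b1010; accounting for the
+/// unknown sign bit gives Min = 0b1001 (-7) and Max = 0b0011 (3).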
+/// TODO: Move to method on KnownBits struct?
+static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known,
+ APInt &Min, APInt &Max) {
+ assert(Known.getBitWidth() == Min.getBitWidth() &&
+ Known.getBitWidth() == Max.getBitWidth() &&
+ "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+ APInt UnknownBits = ~(Known.Zero|Known.One);
+
+ // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+ // bit if it is unknown.
+ Min = Known.One;
+ Max = Known.One|UnknownBits;
+
+ if (UnknownBits.isNegative()) { // Sign bit is unknown
+ Min.setSignBit();
+ Max.clearSignBit();
+ }
+}
+
+/// Given an unsigned integer type and a set of known zero and one bits, compute
+/// the maximum and minimum values that could have the specified known zero and
+/// known one bits, returning them in Min/Max.
+/// TODO: Move to method on KnownBits struct?
+static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known,
+ APInt &Min, APInt &Max) {
+ assert(Known.getBitWidth() == Min.getBitWidth() &&
+ Known.getBitWidth() == Max.getBitWidth() &&
+ "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+ APInt UnknownBits = ~(Known.Zero|Known.One);
+
+ // The minimum value is when the unknown bits are all zeros.
+ Min = Known.One;
+ // The maximum value is when the unknown bits are all ones.
+ Max = Known.One|UnknownBits;
+}
+
+/// This is called when we see this pattern:
+/// cmp pred (load (gep GV, ...)), cmpcst
+/// where GV is a global variable with a constant initializer. Try to simplify
+/// this into some simple computation that does not need the load. For example
+/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
+///
+/// If AndCst is non-null, then the loaded value is masked with that constant
+/// before doing the comparison. This handles cases like "A[i]&4 == 0".
+Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV,
+ CmpInst &ICI,
+ ConstantInt *AndCst) {
+ Constant *Init = GV->getInitializer();
+ if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+ return nullptr;
+
+ uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+ // Don't blow up on huge arrays.
+ if (ArrayElementCount > MaxArraySizeForCombine)
+ return nullptr;
+
+ // There are many forms of this optimization we can handle; for now, just do
+ // the simple index into a single-dimensional array.
+ //
+ // Require: GEP GV, 0, i {{, constant indices}}
+ if (GEP->getNumOperands() < 3 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ isa<Constant>(GEP->getOperand(2)))
+ return nullptr;
+
+ // Check that indices after the variable are constants and in-range for the
+ // type they index. Collect the indices. This is typically for arrays of
+ // structs.
+ SmallVector<unsigned, 4> LaterIndices;
+
+ Type *EltTy = Init->getType()->getArrayElementType();
+ for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!Idx) return nullptr; // Variable index.
+
+ uint64_t IdxVal = Idx->getZExtValue();
+ if ((unsigned)IdxVal != IdxVal) return nullptr; // Array index too large.
+
+ if (StructType *STy = dyn_cast<StructType>(EltTy))
+ EltTy = STy->getElementType(IdxVal);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+ if (IdxVal >= ATy->getNumElements()) return nullptr;
+ EltTy = ATy->getElementType();
+ } else {
+ return nullptr; // Unknown type.
+ }
+
+ LaterIndices.push_back(IdxVal);
+ }
+
+ enum { Overdefined = -3, Undefined = -2 };
+
+ // Variables for our state machines.
+
+ // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
+ // "i == 47 | i == 87", where 47 is the first index the condition is true for,
+ // and 87 is the second (and last) index. FirstTrueElement is -2 when
+ // undefined, otherwise set to the first true element. SecondTrueElement is
+ // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
+ int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
+
+ // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
+ // form "i != 47 & i != 87". Same state transitions as for true elements.
+ int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
+
+ /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
+ /// define a state machine that triggers for ranges of values that the index
+ /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
+ /// This is -2 when undefined, -3 when overdefined, and otherwise the last
+ /// index in the range (inclusive). We use -2 for undefined here because we
+ /// use relative comparisons and don't want 0-1 to match -1.
+ int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
+
+ // MagicBitvector - This is a magic bitvector where we set a bit if the
+ // comparison is true for element 'i'. If there are 64 or fewer elements in
+ // the array, this will fully represent all the comparison results.
+ uint64_t MagicBitvector = 0;
+
+ // Scan the array and see if one of our patterns matches.
+ Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
+ for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
+ Constant *Elt = Init->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
+ // If this is indexing an array of structures, get the structure element.
+ if (!LaterIndices.empty())
+ Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
+
+ // If the element is masked, handle it.
+ if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
+
+ // Find out if the comparison would be true or false for the i'th element.
+ Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
+ CompareRHS, DL, &TLI);
+ // If the result is undef for this element, ignore it.
+ if (isa<UndefValue>(C)) {
+ // Extend range state machines to cover this element in case there is an
+ // undef in the middle of the range.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ continue;
+ }
+
+ // If we can't compute the result for any of the elements, we have to give
+ // up evaluating the entire conditional.
+ if (!isa<ConstantInt>(C)) return nullptr;
+
+ // Otherwise, we know if the comparison is true or false for this element,
+ // update our state machines.
+ bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
+
+ // State machine for single/double/range index comparison.
+ if (IsTrueForElt) {
+ // Update the TrueElement state machine.
+ if (FirstTrueElement == Undefined)
+ FirstTrueElement = TrueRangeEnd = i; // First true element.
+ else {
+ // Update double-compare state machine.
+ if (SecondTrueElement == Undefined)
+ SecondTrueElement = i;
+ else
+ SecondTrueElement = Overdefined;
+
+ // Update range state machine.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ else
+ TrueRangeEnd = Overdefined;
+ }
+ } else {
+ // Update the FalseElement state machine.
+ if (FirstFalseElement == Undefined)
+ FirstFalseElement = FalseRangeEnd = i; // First false element.
+ else {
+ // Update double-compare state machine.
+ if (SecondFalseElement == Undefined)
+ SecondFalseElement = i;
+ else
+ SecondFalseElement = Overdefined;
+
+ // Update range state machine.
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ else
+ FalseRangeEnd = Overdefined;
+ }
+ }
+
+ // If this element is in range, update our magic bitvector.
+ if (i < 64 && IsTrueForElt)
+ MagicBitvector |= 1ULL << i;
+
+ // If all of our states become overdefined, bail out early. Since the
+ // predicate is expensive, only check it every 8 elements. This is only
+ // really useful for really huge arrays.
+ if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
+ SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
+ FalseRangeEnd == Overdefined)
+ return nullptr;
+ }
+
+ // Now that we've scanned the entire array, emit our new comparison(s). We
+ // order the state machines by the complexity of the generated code.
+ Value *Idx = GEP->getOperand(2);
+
+ // If the index is larger than the pointer size of the target, truncate the
+ // index down like the GEP would do implicitly. We don't have to do this for
+ // an inbounds GEP because the index can't be out of range.
+ if (!GEP->isInBounds()) {
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
+ unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+ if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
+ Idx = Builder.CreateTrunc(Idx, IntPtrTy);
+ }
+
+ // If the comparison is only true for one or two elements, emit direct
+ // comparisons.
+ if (SecondTrueElement != Overdefined) {
+ // None true -> false.
+ if (FirstTrueElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getFalse());
+
+ Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
+
+ // True for one element -> 'i == 47'.
+ if (SecondTrueElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
+
+ // True for two elements -> 'i == 47 | i == 72'.
+ Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
+ Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
+ Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
+ return BinaryOperator::CreateOr(C1, C2);
+ }
+
+ // If the comparison is only false for one or two elements, emit direct
+ // comparisons.
+ if (SecondFalseElement != Overdefined) {
+ // None false -> true.
+ if (FirstFalseElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getTrue());
+
+ Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
+
+ // False for one element -> 'i != 47'.
+ if (SecondFalseElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
+
+ // False for two elements -> 'i != 47 & i != 72'.
+ Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
+ Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+ Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
+ return BinaryOperator::CreateAnd(C1, C2);
+ }
+
+ // If the comparison can be replaced with a range comparison for the elements
+ // where it is true, emit the range check.
+ if (TrueRangeEnd != Overdefined) {
+ assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
+
+ // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
+ if (FirstTrueElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ TrueRangeEnd-FirstTrueElement+1);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
+ }
+
+ // False range check.
+ if (FalseRangeEnd != Overdefined) {
+ assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
+ // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
+ if (FirstFalseElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ FalseRangeEnd-FirstFalseElement);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
+ }
+
+ // If a magic bitvector captures the entire comparison state
+ // of this load, replace it with computation that does:
+ // ((magic_cst >> i) & 1) != 0
+ {
+ Type *Ty = nullptr;
+
+ // Look for an appropriate type:
+ // - The type of Idx if the magic fits
+ // - The smallest fitting legal type if we have a DataLayout
+ // - Default to i32
+ if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+ Ty = Idx->getType();
+ else
+ Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+
+ if (Ty) {
+ Value *V = Builder.CreateIntCast(Idx, Ty, false);
+ V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+ V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
+ return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Return a value that can be used to compare the *offset* implied by a GEP to
+/// zero. For example, if we have &A[i], we want to return 'i' for
+/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
+/// are involved. The above expression would also be legal to codegen as
+/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This latter form is less amenable to optimization though, and we are allowed
+/// to generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
+static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
+ const DataLayout &DL) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+
+ // Check to see if this gep only has a single variable index. If so, and if
+ // any constant indices are a multiple of its scale, then we can compute this
+ // in terms of the scale of the variable index. For example, if the GEP
+ // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+ // because the expression will cross zero at the same point.
+ unsigned i, e = GEP->getNumOperands();
+ int64_t Offset = 0;
+ for (i = 1; i != e; ++i, ++GTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ } else {
+ // Found our variable index.
+ break;
+ }
+ }
+
+ // If there are no variable indices, we must have a constant offset; just
+ // evaluate it the general way.
+ if (i == e) return nullptr;
+
+ Value *VariableIdx = GEP->getOperand(i);
+ // Determine the scale factor of the variable element. For example, this is
+ // 4 if the variable index is into an array of i32.
+ uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());
+
+ // Verify that there are no other variable indices. If there are, give up;
+ // the caller will emit the offset the hard way.
+ for (++i, ++GTI; i != e; ++i, ++GTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!CI) return nullptr;
+
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ }
+
+ // Okay, we know we have a single variable index, which must be a
+ // pointer/array/vector index. If there is no offset, life is simple, return
+ // the index.
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
+ unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
+ if (Offset == 0) {
+ // Cast to IntPtrTy in case a truncation occurs. If an extension is needed,
+ // we don't need to bother extending: the extension won't affect where the
+ // computation crosses zero.
+ if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
+ VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
+ }
+ return VariableIdx;
+ }
+
+ // Otherwise, there is an index. The computation we will do will be modulo
+ // the pointer size, so get it.
+ uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+ Offset &= PtrSizeMask;
+ VariableScale &= PtrSizeMask;
+
+ // To do this transformation, any constant index must be a multiple of the
+ // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
+ // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
+ // multiple of the variable scale.
+ int64_t NewOffs = Offset / (int64_t)VariableScale;
+ if (Offset != NewOffs*(int64_t)VariableScale)
+ return nullptr;
+
+ // Okay, we can do this evaluation. Start by converting the index to intptr.
+ if (VariableIdx->getType() != IntPtrTy)
+ VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
+ true /*Signed*/);
+ Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+ return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
+}
+
+/// Returns true if we can rewrite Start as a GEP with pointer Base
+/// and some integer offset. The nodes that need to be re-written
+/// for this transformation will be added to Explored.
+static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ SmallVector<Value *, 16> WorkList(1, Start);
+ Explored.insert(Base);
+
+ // The following traversal gives us an order which can be used
+ // when doing the final transformation. Since in the final
+ // transformation we create the PHI replacement instructions first,
+ // we don't have to get them in any particular order.
+ //
+ // However, for other instructions we will have to traverse the
+ // operands of an instruction first, which means that we have to
+ // do a post-order traversal.
+ while (!WorkList.empty()) {
+ SetVector<PHINode *> PHIs;
+
+ while (!WorkList.empty()) {
+ if (Explored.size() >= 100)
+ return false;
+
+ Value *V = WorkList.back();
+
+ if (Explored.count(V) != 0) {
+ WorkList.pop_back();
+ continue;
+ }
+
+ if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
+ !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
+ // We've found some value that we can't explore which is different from
+ // the base. Therefore we can't do this transformation.
+ return false;
+
+ if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
+ auto *CI = dyn_cast<CastInst>(V);
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ if (Explored.count(CI->getOperand(0)) == 0)
+ WorkList.push_back(CI->getOperand(0));
+ }
+
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ // We're limiting the GEP to having one index. This will preserve
+ // the original pointer type. We could handle more cases in the
+ // future.
+ if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
+ GEP->getType() != Start->getType())
+ return false;
+
+ if (Explored.count(GEP->getOperand(0)) == 0)
+ WorkList.push_back(GEP->getOperand(0));
+ }
+
+ if (WorkList.back() == V) {
+ WorkList.pop_back();
+ // We've finished visiting this node, mark it as such.
+ Explored.insert(V);
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // We cannot transform PHIs on unsplittable basic blocks.
+ if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
+ return false;
+ Explored.insert(PN);
+ PHIs.insert(PN);
+ }
+ }
+
+ // Explore the PHI nodes further.
+ for (auto *PN : PHIs)
+ for (Value *Op : PN->incoming_values())
+ if (Explored.count(Op) == 0)
+ WorkList.push_back(Op);
+ }
+
+ // Make sure that we can do this. Since we can't insert GEPs in a basic
+ // block before a PHI node, we can't easily do this transformation if
+ // we have PHI node users of transformed instructions.
+ for (Value *Val : Explored) {
+ for (Value *Use : Val->uses()) {
+
+ auto *PHI = dyn_cast<PHINode>(Use);
+ auto *Inst = dyn_cast<Instruction>(Val);
+
+ if (Inst == Base || Inst == PHI || !Inst || !PHI ||
+ Explored.count(PHI) == 0)
+ continue;
+
+ if (PHI->getParent() == Inst->getParent())
+ return false;
+ }
+ }
+ return true;
+}
+
+// Sets the appropriate insert point on Builder where we can add
+// a replacement Instruction for V (if that is possible).
+static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
+ bool Before = true) {
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
+ return;
+ }
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!Before)
+ I = &*std::next(I->getIterator());
+ Builder.SetInsertPoint(I);
+ return;
+ }
+ if (auto *A = dyn_cast<Argument>(V)) {
+ // Set the insertion point in the entry block.
+ BasicBlock &Entry = A->getParent()->getEntryBlock();
+ Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
+ return;
+ }
+ // Otherwise, this is a constant and we don't need to set a new
+ // insertion point.
+ assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
+}
+
+/// Returns a re-written value of Start as an indexed GEP using Base as a
+/// pointer.
+static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ // Perform all the substitutions. This is a bit tricky because we can
+ // have cycles in our use-def chains.
+ // 1. Create the PHI nodes without any incoming values.
+ // 2. Create all the other values.
+ // 3. Add the edges for the PHI nodes.
+ // 4. Emit GEPs to get the original pointers.
+ // 5. Remove the original instructions.
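+ // For example, a chain "P1 = gep inbounds P0, I; P2 = phi [P0, P1]" becomes
+ // "Idx = phi [0, I]" followed by "P2.ptr = gep inbounds P0, Idx", and the
+ // original pointers are replaced by the new GEPs.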
+ Type *IndexType = IntegerType::get(
+ Base->getContext(), DL.getPointerTypeSizeInBits(Start->getType()));
+
+ DenseMap<Value *, Value *> NewInsts;
+ NewInsts[Base] = ConstantInt::getNullValue(IndexType);
+
+ // Create the new PHI nodes, without adding any incoming values.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // Create empty phi nodes. This avoids cyclic dependencies when creating
+ // the remaining instructions.
+ if (auto *PHI = dyn_cast<PHINode>(Val))
+ NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
+ PHI->getName() + ".idx", PHI);
+ }
+ IRBuilder<> Builder(Base->getContext());
+
+ // Create all the other instructions.
+ for (Value *Val : Explored) {
+
+ if (NewInsts.find(Val) != NewInsts.end())
+ continue;
+
+ if (auto *CI = dyn_cast<CastInst>(Val)) {
+ NewInsts[CI] = NewInsts[CI->getOperand(0)];
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
+ Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
+ : GEP->getOperand(1);
+ setInsertionPoint(Builder, GEP);
+ // Indices might need to be sign extended. GEPs will magically do
+ // this, but we need to do it ourselves here.
+ if (Index->getType()->getScalarSizeInBits() !=
+ NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
+ Index = Builder.CreateSExtOrTrunc(
+ Index, NewInsts[GEP->getOperand(0)]->getType(),
+ GEP->getOperand(0)->getName() + ".sext");
+ }
+
+ auto *Op = NewInsts[GEP->getOperand(0)];
+ if (isa<ConstantInt>(Op) && dyn_cast<ConstantInt>(Op)->isZero())
+ NewInsts[GEP] = Index;
+ else
+ NewInsts[GEP] = Builder.CreateNSWAdd(
+ Op, Index, GEP->getOperand(0)->getName() + ".add");
+ continue;
+ }
+ if (isa<PHINode>(Val))
+ continue;
+
+ llvm_unreachable("Unexpected instruction type");
+ }
+
+ // Add the incoming values to the PHI nodes.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // All the instructions have been created, we can now add edges to the
+ // phi nodes.
+ if (auto *PHI = dyn_cast<PHINode>(Val)) {
+ PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+ Value *NewIncoming = PHI->getIncomingValue(I);
+
+ if (NewInsts.find(NewIncoming) != NewInsts.end())
+ NewIncoming = NewInsts[NewIncoming];
+
+ NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
+ }
+ }
+ }
+
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+
+ // Depending on the type, for external users we have to emit
+ // a GEP or a GEP + ptrtoint.
+ setInsertionPoint(Builder, Val, false);
+
+ // If required, create an inttoptr instruction for Base.
+ Value *NewBase = Base;
+ if (!Base->getType()->isPointerTy())
+ NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
+ Start->getName() + "to.ptr");
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Start->getType()->getPointerElementType(), NewBase,
+ makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+
+ if (!Val->getType()->isPointerTy()) {
+ Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
+ Val->getName() + ".conv");
+ GEP = Cast;
+ }
+ Val->replaceAllUsesWith(GEP);
+ }
+
+ return NewInsts[Start];
+}
+
+/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
+/// the input Value as a constant indexed GEP. Returns a pair containing
+/// the GEP's pointer and index.
+static std::pair<Value *, Value *>
+getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+ Type *IndexType = IntegerType::get(V->getContext(),
+ DL.getPointerTypeSizeInBits(V->getType()));
+
+ Constant *Index = ConstantInt::getNullValue(IndexType);
+ while (true) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ // We accept only inbounds GEPs here to exclude the possibility of
+ // overflow.
+ if (!GEP->isInBounds())
+ break;
+ if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
+ GEP->getType() == V->getType()) {
+ V = GEP->getOperand(0);
+ Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
+ Index = ConstantExpr::getAdd(
+ Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+ continue;
+ }
+ break;
+ }
+ if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ break;
+ }
+ return {V, Index};
+}
+
+/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
+/// We can look through PHIs, GEPs and casts in order to determine a common base
+/// between GEPLHS and RHS.
+static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ const DataLayout &DL) {
+ if (!GEPLHS->hasAllConstantIndices())
+ return nullptr;
+
+ // Make sure the pointers have the same type.
+ if (GEPLHS->getType() != RHS->getType())
+ return nullptr;
+
+ Value *PtrBase, *Index;
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+
+ // The set of nodes that will take part in this transformation.
+ SetVector<Value *> Nodes;
+
+ if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ return nullptr;
+
+ // We know we can re-write this as
+ //   ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2))
+ // Since we've only looked through inbounds GEPs we know that we
+ // can't have overflow on either side. We can therefore re-write
+ // this as:
+ // OFFSET1 cmp OFFSET2
+ Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+
+ // rewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
+ // GEP having PtrBase as the pointer base, and has returned in NewRHS the
+ // offset. Since Index is the offset of LHS to the base pointer, we will now
+ // compare the offsets instead of comparing the pointers.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
+}
+
+/// Fold comparisons between a GEP instruction and something else. At this point
+/// we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ Instruction &I) {
+ // Don't transform signed compares of GEPs into index compares. Even if the
+ // GEP is inbounds, the final add of the base pointer can have signed overflow
+ // and would change the result of the icmp.
+ // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
+ // the maximum signed value for the pointer type.
+ if (ICmpInst::isSigned(Cond))
+ return nullptr;
+
+ // Look through bitcasts and addrspacecasts. We do not however want to remove
+ // 0 GEPs.
+ if (!isa<GetElementPtrInst>(RHS))
+ RHS = RHS->stripPointerCasts();
+
+ Value *PtrBase = GEPLHS->getOperand(0);
+ if (PtrBase == RHS && GEPLHS->isInBounds()) {
+ // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
+ // This transformation (ignoring the base and scales) is valid because we
+ // know pointers can't overflow since the gep is inbounds. See if we can
+ // output an optimized form.
+ Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
+
+ // If not, synthesize the offset the hard way.
+ if (!Offset)
+ Offset = EmitGEPOffset(GEPLHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+ Constant::getNullValue(Offset->getType()));
+ } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+ // If the base pointers are different, but the indices are the same, just
+ // compare the base pointer.
+ if (PtrBase != GEPRHS->getOperand(0)) {
+ bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+ IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+ GEPRHS->getOperand(0)->getType();
+ if (IndicesTheSame)
+ for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ IndicesTheSame = false;
+ break;
+ }
+
+ // If all indices are the same, just compare the base pointers.
+ if (IndicesTheSame)
+ return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+ // If we're comparing GEPs with two base pointers that only differ in type
+ // and both GEPs have only constant indices or just one use, then fold
+ // the compare with the adjusted indices.
+ if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
+ (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
+ (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
+ PtrBase->stripPointerCasts() ==
+ GEPRHS->getOperand(0)->stripPointerCasts()) {
+ Value *LOffset = EmitGEPOffset(GEPLHS);
+ Value *ROffset = EmitGEPOffset(GEPRHS);
+
+ // If we looked through an addrspacecast between different sized address
+ // spaces, the LHS and RHS pointers are different sized
+ // integers. Truncate to the smaller one.
+ Type *LHSIndexTy = LOffset->getType();
+ Type *RHSIndexTy = ROffset->getType();
+ if (LHSIndexTy != RHSIndexTy) {
+ if (LHSIndexTy->getPrimitiveSizeInBits() <
+ RHSIndexTy->getPrimitiveSizeInBits()) {
+ ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
+ } else
+ LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
+ LOffset, ROffset);
+ return replaceInstUsesWith(I, Cmp);
+ }
+
+ // Otherwise, the base pointers are different and the indices are
+ // different. Try to convert this to an indexed compare by looking through
+ // PHIs/casts.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+ }
+
+ // If one of the GEPs has all zero indices, recurse.
+ if (GEPLHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+ ICmpInst::getSwappedPredicate(Cond), I);
+
+ // If the other GEP has all zero indices, recurse.
+ if (GEPRHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+ bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
+ if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+ // If the GEPs only differ by one index, compare it.
+ unsigned NumDifferences = 0; // Keep track of # differences.
+ unsigned DiffOperand = 0; // The operand that differs.
+ for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+ GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+ // Irreconcilable differences.
+ NumDifferences = 2;
+ break;
+ } else {
+ if (NumDifferences++) break;
+ DiffOperand = i;
+ }
+ }
+
+ if (NumDifferences == 0) // SAME GEP?
+ return replaceInstUsesWith(I, // No comparison is needed here.
+ Builder.getInt1(ICmpInst::isTrueWhenEqual(Cond)));
+
+ else if (NumDifferences == 1 && GEPsInBounds) {
+ Value *LHSV = GEPLHS->getOperand(DiffOperand);
+ Value *RHSV = GEPRHS->getOperand(DiffOperand);
+ // Make sure we do a signed comparison here.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+ }
+ }
+
+ // Only lower this if the icmp is the only user of the GEP or if we expect
+ // the result to fold to a constant!
+ if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+ (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+ // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
+ Value *L = EmitGEPOffset(GEPLHS);
+ Value *R = EmitGEPOffset(GEPRHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+ }
+ }
+
+ // Try to convert this to an indexed compare by looking through PHIs/casts
+ // as a last resort.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+}
+
+Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
+ const AllocaInst *Alloca,
+ const Value *Other) {
+ assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
+
+ // It would be tempting to fold away comparisons between allocas and any
+ // pointer not based on that alloca (e.g. an argument). However, even
+ // though such pointers cannot alias, they can still compare equal.
+ //
+ // But LLVM doesn't specify where allocas get their memory, so if the alloca
+ // doesn't escape we can argue that it's impossible to guess its value, and we
+ // can therefore act as if any such guesses are wrong.
+ //
+ // The code below checks that the alloca doesn't escape, and that it's only
+ // used in a comparison once (the current instruction). The
+ // single-comparison-use condition ensures that we're trivially folding all
+ // comparisons against the alloca consistently, and avoids the risk of
+ // erroneously folding a comparison of the pointer with itself.
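+ // For example, an equality comparison of a non-escaping "alloca i32" against
+ // an incoming pointer argument can be folded to false (and "ne" to true).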
+
+ unsigned MaxIter = 32; // Break cycles and bound to constant-time.
+
+ SmallVector<const Use *, 32> Worklist;
+ for (const Use &U : Alloca->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+
+ unsigned NumCmps = 0;
+ while (!Worklist.empty()) {
+ assert(Worklist.size() <= MaxIter);
+ const Use *U = Worklist.pop_back_val();
+ const Value *V = U->getUser();
+ --MaxIter;
+
+ if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
+ isa<SelectInst>(V)) {
+ // Track the uses.
+ } else if (isa<LoadInst>(V)) {
+ // Loading from the pointer doesn't escape it.
+ continue;
+ } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
+ // Storing *to* the pointer is fine, but storing the pointer escapes it.
+ if (SI->getValueOperand() == U->get())
+ return nullptr;
+ continue;
+ } else if (isa<ICmpInst>(V)) {
+ if (NumCmps++)
+ return nullptr; // Found more than one cmp.
+ continue;
+ } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrin->getIntrinsicID()) {
+ // These intrinsics don't escape or compare the pointer. Memset is safe
+ // because we don't allow ptrtoint. Memcpy and memmove are safe because
+ // we don't allow stores, so src cannot point to V.
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+ case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
+ case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
+ continue;
+ default:
+ return nullptr;
+ }
+ } else {
+ return nullptr;
+ }
+ for (const Use &U : V->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+ }
+
+ Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
+ return replaceInstUsesWith(
+ ICI,
+ ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
+}
+
+/// Fold "icmp pred (X+CI), X".
+Instruction *InstCombiner::foldICmpAddOpConst(Instruction &ICI,
+ Value *X, ConstantInt *CI,
+ ICmpInst::Predicate Pred) {
+ // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
+ // so the values can never be equal. Similarly for all other "or equals"
+ // operators.
+
+ // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
+ // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
+ // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
+ Value *R =
+ ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI);
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
+ }
+
+ // (X+1) >u X --> X <u (0-1) --> X != 255
+ // (X+2) >u X --> X <u (0-2) --> X <u 254
+ // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI));
+
+ unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits();
+ ConstantInt *SMax = ConstantInt::get(X->getContext(),
+ APInt::getSignedMaxValue(BitWidth));
+
+ // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
+ // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
+ // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
+ // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
+ // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126
+ // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI));
+
+ // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
+ // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
+ // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
+ // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
+ // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
+ // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
+
+ assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
+ Constant *C = Builder.getInt(CI->getValue() - 1);
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
+}
+
+/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
+/// (icmp eq/ne A, Log2(AP2/AP1)) ->
+/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
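+/// For example, (icmp eq (lshr i8 64, A), 4) becomes (icmp eq A, 4), since
+/// Log2(64) - Log2(4) == 6 - 2 == 4.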
+Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
+ const APInt &AP1,
+ const APInt &AP2) {
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ bool IsAShr = isa<AShrOperator>(I.getOperand(0));
+ if (IsAShr) {
+ if (AP2.isAllOnesValue())
+ return nullptr;
+ if (AP2.isNegative() != AP1.isNegative())
+ return nullptr;
+ if (AP2.sgt(AP1))
+ return nullptr;
+ }
+
+ if (!AP1)
+ // 'A' must be large enough to shift out the highest set bit.
+ return getICmp(I.ICMP_UGT, A,
+ ConstantInt::get(A->getType(), AP2.logBase2()));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ int Shift;
+ if (IsAShr && AP1.isNegative())
+ Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
+ else
+ Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
+
+ if (Shift > 0) {
+ if (IsAShr && AP1 == AP2.ashr(Shift)) {
+ // There are multiple solutions if we are comparing against -1 and the LHS
+ // of the ashr is not a power of two.
+ if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
+ return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ } else if (AP1 == AP2.lshr(Shift)) {
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ }
+ }
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
+/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
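+/// For example, (icmp eq (shl i8 1, A), 8) becomes (icmp eq A, 3), since
+/// TrailingZeros(8) - TrailingZeros(1) == 3 - 0 == 3.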
+Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
+ const APInt &AP1,
+ const APInt &AP2) {
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ unsigned AP2TrailingZeros = AP2.countTrailingZeros();
+
+ if (!AP1 && AP2TrailingZeros != 0)
+ return getICmp(
+ I.ICMP_UGE, A,
+ ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ // Get the distance between the lowest bits that are set.
+ int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
+
+ if (Shift > 0 && AP2.shl(Shift) == AP1)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// The caller has matched a pattern of the form:
+/// I = icmp ugt (add (add A, B), CI2), CI1
+/// If this is of the form:
+/// sum = a + b
+/// if (sum+128 >u 255)
+/// Then replace it with llvm.sadd.with.overflow.i8.
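+/// In the rewritten form, A and B are truncated to the narrow type, the sum
+/// and the overflow bit are computed as
+///   %sadd = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
+/// and the original icmp is replaced by the extracted overflow bit.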
+///
+static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
+ ConstantInt *CI2, ConstantInt *CI1,
+ InstCombiner &IC) {
+ // The transformation we're trying to do here is to transform this into an
+ // llvm.sadd.with.overflow. To do this, we have to replace the original add
+ // with a narrower add, and discard the add-with-constant that is part of the
+ // range check (if we can't eliminate it, this isn't profitable).
+
+ // In order to eliminate the add-with-constant, the compare must be its only
+ // use.
+ Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
+ if (!AddWithCst->hasOneUse())
+ return nullptr;
+
+ // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
+ if (!CI2->getValue().isPowerOf2())
+ return nullptr;
+ unsigned NewWidth = CI2->getValue().countTrailingZeros();
+ if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
+ return nullptr;
+
+ // The width of the new add formed is 1 more than the bias.
+ ++NewWidth;
+
+ // Check to see that CI1 is an all-ones value with NewWidth bits.
+ if (CI1->getBitWidth() == NewWidth ||
+ CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
+ return nullptr;
+
+ // This is only really a signed overflow check if the inputs have been
+ // sign-extended; check for that condition. For example, if CI2 is 2^31 and
+ // the operands of the add are 64 bits wide, we need at least 33 sign bits.
+ unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
+ if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
+ IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
+ return nullptr;
+
+ // In order to replace the original add with a narrower
+ // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
+ // and truncates that discard the high bits of the add. Verify that this is
+ // the case.
+ Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
+ for (User *U : OrigAdd->users()) {
+ if (U == AddWithCst)
+ continue;
+
+ // Only accept truncates for now. We would really like a nice recursive
+ // predicate like SimplifyDemandedBits, but one that goes down the use-def
+ // chain to see which bits of a value are actually demanded. If the
+ // original add had another add which was then immediately truncated, we
+ // could still do the transformation.
+ TruncInst *TI = dyn_cast<TruncInst>(U);
+ if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
+ return nullptr;
+ }
+
+ // If the pattern matches, truncate the inputs to the narrower type and
+ // use the sadd_with_overflow intrinsic to efficiently compute both the
+ // result and the overflow bit.
+ Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
+ Value *F = Intrinsic::getDeclaration(I.getModule(),
+ Intrinsic::sadd_with_overflow, NewType);
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+
+ // Put the new code above the original add, in case there are any uses of the
+ // add between the add and the compare.
+ Builder.SetInsertPoint(OrigAdd);
+
+ Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
+ Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
+ CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
+ Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
+ Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
+
+ // The inner add was the result of the narrow add, zero extended to the
+ // wider type. Replace it with the result computed by the intrinsic.
+ IC.replaceInstUsesWith(*OrigAdd, ZExt);
+
+ // The original icmp gets replaced with the overflow value.
+ return ExtractValueInst::Create(Call, 1, "sadd.overflow");
+}
+
+// Fold icmp Pred X, C.
+Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Cmp.getOperand(0);
+
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
+
+ Value *A = nullptr, *B = nullptr;
+
+ // Match the following pattern, which is a common idiom when writing
+ // overflow-safe integer arithmetic functions. The source performs an addition
+ // in a wider type and explicitly checks for overflow using comparisons against
+ // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
+ //
+ // TODO: This could probably be generalized to handle other overflow-safe
+ // operations if we worked out the formulas to compute the appropriate magic
+ // constants.
+ //
+ // sum = a + b
+ // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
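+ // Illustrative walk-through (values chosen here, not taken from the source):
+ // with i8 operands sign-extended to i32, A = 100 and B = 30 give sum = 130;
+ // sum + 128 = 258 >u 255, so the check fires, matching the fact that
+ // 100 + 30 overflows i8. The rewrite emits
+ //   %s = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
+ // and the icmp is replaced by the extracted i1 overflow flag.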
+ {
+ ConstantInt *CI2; // I = icmp ugt (add (add A, B), CI2), CI
+ if (Pred == ICmpInst::ICMP_UGT &&
+ match(X, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+ if (Instruction *Res = processUGT_ADDCST_ADD(
+ Cmp, A, B, CI2, cast<ConstantInt>(Cmp.getOperand(1)), *this))
+ return Res;
+ }
+
+ // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
+ if (C->isNullValue() && Pred == ICmpInst::ICMP_SGT) {
+ SelectPatternResult SPR = matchSelectPattern(X, A, B);
+ if (SPR.Flavor == SPF_SMIN) {
+ if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, B, Cmp.getOperand(1));
+ if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, A, Cmp.getOperand(1));
+ }
+ }
+
+ // FIXME: Use m_APInt to allow folds for splat constants.
+ ConstantInt *CI = dyn_cast<ConstantInt>(Cmp.getOperand(1));
+ if (!CI)
+ return nullptr;
+
+ // Canonicalize icmp instructions based on dominating conditions.
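+ // For example (illustrative): if a dominating branch has already established
+ // X u< 2 on this path, then a compare X u< 3 here has an empty difference and
+ // folds to true, while X u> 2 has an empty intersection and folds to false.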
+ BasicBlock *Parent = Cmp.getParent();
+ BasicBlock *Dom = Parent->getSinglePredecessor();
+ auto *BI = Dom ? dyn_cast<BranchInst>(Dom->getTerminator()) : nullptr;
+ ICmpInst::Predicate Pred2;
+ BasicBlock *TrueBB, *FalseBB;
+ ConstantInt *CI2;
+ if (BI && match(BI, m_Br(m_ICmp(Pred2, m_Specific(X), m_ConstantInt(CI2)),
+ TrueBB, FalseBB)) &&
+ TrueBB != FalseBB) {
+ ConstantRange CR =
+ ConstantRange::makeAllowedICmpRegion(Pred, CI->getValue());
+ ConstantRange DominatingCR =
+ (Parent == TrueBB)
+ ? ConstantRange::makeExactICmpRegion(Pred2, CI2->getValue())
+ : ConstantRange::makeExactICmpRegion(
+ CmpInst::getInversePredicate(Pred2), CI2->getValue());
+ ConstantRange Intersection = DominatingCR.intersectWith(CR);
+ ConstantRange Difference = DominatingCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (Difference.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+
+ // If this is a normal comparison, it demands all bits. If it is a sign
+ // bit comparison, it only demands the sign bit.
+ bool UnusedBit;
+ bool IsSignBit = isSignBitCheck(Pred, CI->getValue(), UnusedBit);
+
+ // Canonicalizing a sign bit comparison that gets used in a branch
+ // pessimizes codegen by generating a branch-on-zero instruction instead
+ // of a test-and-branch. So we avoid canonicalizing in such situations
+ // because a test-and-branch instruction has better branch displacement
+ // than a compare-and-branch instruction.
+ if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
+ return nullptr;
+
+ if (auto *AI = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*AI));
+ if (auto *AD = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*AD));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (trunc X, Y), C.
+Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
+ Instruction *Trunc,
+ const APInt *C) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Trunc->getOperand(0);
+ if (C->isOneValue() && C->getBitWidth() > 1) {
+ // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
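+ // For instance (illustrative): if x is i32 and computeKnownBits shows the
+ // top 24 bits of x are all zero, icmp eq (trunc x to i8), 42 becomes
+ // icmp eq x, 42.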
+ unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
+ SrcBits = X->getType()->getScalarSizeInBits();
+ KnownBits Known = computeKnownBits(X, 0, &Cmp);
+
+ // If all the high bits are known, we can do this xform.
+ if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS = C->zext(SrcBits);
+ NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (xor X, Y), C.
+Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
+ BinaryOperator *Xor,
+ const APInt *C) {
+ Value *X = Xor->getOperand(0);
+ Value *Y = Xor->getOperand(1);
+ const APInt *XorC;
+ if (!match(Y, m_APInt(XorC)))
+ return nullptr;
+
+ // If this is a comparison that tests the sign bit (X < 0) or (X > -1),
+ // fold the xor.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if ((Pred == ICmpInst::ICMP_SLT && C->isNullValue()) ||
+ (Pred == ICmpInst::ICMP_SGT && C->isAllOnesValue())) {
+
+ // If the sign bit of XorC is not set, the xor does not change the
+ // result of the comparison; just stop using the xor.
+ if (!XorC->isNegative()) {
+ Cmp.setOperand(0, X);
+ Worklist.Add(Xor);
+ return &Cmp;
+ }
+
+ // Was the old condition true if the operand is positive?
+ bool isTrueIfPositive = Pred == ICmpInst::ICMP_SGT;
+
+ // If so, the new one isn't.
+ isTrueIfPositive ^= true;
+
+ Constant *CmpConstant = cast<Constant>(Cmp.getOperand(1));
+ if (isTrueIfPositive)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, SubOne(CmpConstant));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, AddOne(CmpConstant));
+ }
+
+ if (Xor->hasOneUse()) {
+ // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
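+ // Illustrative i8 instance (constants chosen here): icmp ult (xor X, 0x80),
+ // 0x90 --> icmp slt X, 16, since xoring with the sign mask maps signed order
+ // onto unsigned order and 0x90 ^ 0x80 == 0x10.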
+ if (!Cmp.isEquality() && XorC->isSignMask()) {
+ Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
+ : Cmp.getSignedPredicate();
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), *C ^ *XorC));
+ }
+
+ // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
+ if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
+ Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
+ : Cmp.getSignedPredicate();
+ Pred = Cmp.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), *C ^ *XorC));
+ }
+ }
+
+ // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C)
+ // iff -C is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && *XorC == ~(*C) && (*C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // (icmp ult (xor X, C), -C) -> (icmp uge X, C)
+ // iff -C is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && *XorC == -(*C) && C->isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
+
+ return nullptr;
+}
+
+/// Fold icmp (and (sh X, Y), C2), C1.
+Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1, const APInt *C2) {
+ BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
+ if (!Shift || !Shift->isShift())
+ return nullptr;
+
+ // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
+ // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
+ // code produced by the clang front-end, for bitfield access.
+ // This seemingly simple opportunity to fold away a shift turns out to be
+ // rather complicated. See PR17827 for details.
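+ // Illustrative bitfield-style instance (constants chosen here):
+ //   ((X >> 3) & 7) == 5 --> (X & 56) == 40
+ // i.e. with C3 = 3 and C2 = 7 both the mask and the compared value are
+ // shifted left by 3, and the shift of X itself goes away.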
+ unsigned ShiftOpcode = Shift->getOpcode();
+ bool IsShl = ShiftOpcode == Instruction::Shl;
+ const APInt *C3;
+ if (match(Shift->getOperand(1), m_APInt(C3))) {
+ bool CanFold = false;
+ if (ShiftOpcode == Instruction::AShr) {
+ // There may be some constraints that make this possible, but nothing
+ // simple has been discovered yet.
+ CanFold = false;
+ } else if (ShiftOpcode == Instruction::Shl) {
+ // For a left shift, we can fold if the comparison is not signed. We can
+ // also fold a signed comparison if the mask value and comparison value
+ // are not negative. These constraints may not be obvious, but we can
+ // prove that they are correct using an SMT solver.
+ if (!Cmp.isSigned() || (!C2->isNegative() && !C1->isNegative()))
+ CanFold = true;
+ } else if (ShiftOpcode == Instruction::LShr) {
+ // For a logical right shift, we can fold if the comparison is not signed.
+ // We can also fold a signed comparison if the shifted mask value and the
+ // shifted comparison value are not negative. These constraints may not be
+ // obvious, but we can prove that they are correct using an SMT solver.
+ if (!Cmp.isSigned() ||
+ (!C2->shl(*C3).isNegative() && !C1->shl(*C3).isNegative()))
+ CanFold = true;
+ }
+
+ if (CanFold) {
+ APInt NewCst = IsShl ? C1->lshr(*C3) : C1->shl(*C3);
+ APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3);
+ // Check to see if we are shifting out any of the bits being compared.
+ if (SameAsC1 != *C1) {
+ // If we shifted bits out, the fold is not going to work out. As a
+ // special case, check to see if this means that the result is always
+ // true or false now.
+ if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
+ return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
+ if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
+ return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
+ } else {
+ Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst));
+ APInt NewAndCst = IsShl ? C2->lshr(*C3) : C2->shl(*C3);
+ And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst));
+ And->setOperand(0, Shift->getOperand(0));
+ Worklist.Add(Shift); // Shift is dead.
+ return &Cmp;
+ }
+ }
+ }
+
+ // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
+ // preferable because it allows the C2 << Y expression to be hoisted out of a
+ // loop if Y is invariant and X is not.
+ if (Shift->hasOneUse() && C1->isNullValue() && Cmp.isEquality() &&
+ !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
+ // Compute C2 << Y.
+ Value *NewShift =
+ IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
+ : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
+
+ // Compute X & (C2 << Y).
+ Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
+ Cmp.setOperand(0, NewAnd);
+ return &Cmp;
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (and X, C2), C1.
+Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
+ BinaryOperator *And,
+ const APInt *C1) {
+ const APInt *C2;
+ if (!match(And->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ if (!And->hasOneUse() || !And->getOperand(0)->hasOneUse())
+ return nullptr;
+
+ // If the LHS is an 'and' of a truncate and we can widen the and/compare to
+ // the input width without changing the value produced, eliminate the cast:
+ //
+ // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
+ //
+ // We can do this transformation if the constants do not have their sign bits
+ // set or if it is an equality comparison. Extending a relational comparison
+ // when we're checking the sign bit would not work.
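+ // Illustrative instance (types assumed here): with W of type i32,
+ //   icmp eq (and (trunc i32 %W to i8), 15), 3
+ // becomes
+ //   icmp eq (and i32 %W, 15), 3
+ // since zero-extending both constants to i32 preserves the result.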
+ Value *W;
+ if (match(And->getOperand(0), m_Trunc(m_Value(W))) &&
+ (Cmp.isEquality() || (!C1->isNegative() && !C2->isNegative()))) {
+ // TODO: Is this a good transform for vectors? Wider types may reduce
+ // throughput. Should this transform be limited (even for scalars) by using
+ // shouldChangeType()?
+ if (!Cmp.getType()->isVectorTy()) {
+ Type *WideType = W->getType();
+ unsigned WideScalarBits = WideType->getScalarSizeInBits();
+ Constant *ZextC1 = ConstantInt::get(WideType, C1->zext(WideScalarBits));
+ Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
+ Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
+ }
+ }
+
+ if (Instruction *I = foldICmpAndShift(Cmp, And, C1, C2))
+ return I;
+
+ // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
+ // (icmp pred (and A, (or (shl 1, B), 1)), 0)
+ //
+ // iff pred isn't signed
+ if (!Cmp.isSigned() && C1->isNullValue() &&
+ match(And->getOperand(1), m_One())) {
+ Constant *One = cast<Constant>(And->getOperand(1));
+ Value *Or = And->getOperand(0);
+ Value *A, *B, *LShr;
+ if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
+ match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
+ unsigned UsesRemoved = 0;
+ if (And->hasOneUse())
+ ++UsesRemoved;
+ if (Or->hasOneUse())
+ ++UsesRemoved;
+ if (LShr->hasOneUse())
+ ++UsesRemoved;
+
+ // Compute A & ((1 << B) | 1)
+ Value *NewOr = nullptr;
+ if (auto *C = dyn_cast<Constant>(B)) {
+ if (UsesRemoved >= 1)
+ NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
+ } else {
+ if (UsesRemoved >= 3)
+ NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
+ /*HasNUW=*/true),
+ One, Or->getName());
+ }
+ if (NewOr) {
+ Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
+ Cmp.setOperand(0, NewAnd);
+ return &Cmp;
+ }
+ }
+ }
+
+ // (X & C2) > C1 --> (X & C2) != 0, if any bit set in (X & C2) will produce a
+ // result greater than C1.
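+ // For example (illustrative): with C2 = 0xF0 the smallest nonzero value of
+ // (X & 0xF0) is 16, so (X & 0xF0) >u 7 becomes (X & 0xF0) != 0.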
+ unsigned NumTZ = C2->countTrailingZeros();
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && NumTZ < C2->getBitWidth() &&
+ APInt::getOneBitSet(C2->getBitWidth(), NumTZ).ugt(*C1)) {
+ Constant *Zero = Constant::getNullValue(And->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (and X, Y), C.
+Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
+ BinaryOperator *And,
+ const APInt *C) {
+ if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
+ return I;
+
+ // TODO: These all require that Y is constant too, so refactor with the above.
+
+ // Try to optimize things like "A[i] & 42 == 0" to index computations.
+ Value *X = And->getOperand(0);
+ Value *Y = And->getOperand(1);
+ if (auto *LI = dyn_cast<LoadInst>(X))
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !LI->isVolatile() && isa<ConstantInt>(Y)) {
+ ConstantInt *C2 = cast<ConstantInt>(Y);
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
+ return Res;
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ // X & -C == -C -> X >u ~C
+ // X & -C != -C -> X <=u ~C
+ // iff C is a power of 2
+ if (Cmp.getOperand(1) == Y && (-(*C)).isPowerOf2()) {
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
+ : CmpInst::ICMP_ULE;
+ return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
+ }
+
+ // (X & C2) == 0 -> (trunc X) >= 0
+ // (X & C2) != 0 -> (trunc X) < 0
+ // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
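+ // For instance (illustrative, assuming i8 is legal for the target):
+ //   (X & 128) == 0 --> (trunc X to i8) s>= 0
+ //   (X & 128) != 0 --> (trunc X to i8) s< 0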
+ const APInt *C2;
+ if (And->hasOneUse() && C->isNullValue() && match(Y, m_APInt(C2))) {
+ int32_t ExactLogBase2 = C2->exactLogBase2();
+ if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
+ Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
+ if (And->getType()->isVectorTy())
+ NTy = VectorType::get(NTy, And->getType()->getVectorNumElements());
+ Value *Trunc = Builder.CreateTrunc(X, NTy);
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
+ : CmpInst::ICMP_SLT;
+ return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (or X, Y), C.
+Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt *C) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (C->isOneValue()) {
+ // icmp slt signum(V) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
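+ // e.g. (illustrative): X | 7 == 7 --> X <=u 7, and X | 7 != 7 --> X >u 7.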
+ if (Cmp.isEquality() && Cmp.getOperand(1) == Or->getOperand(1) &&
+ (*C + 1).isPowerOf2()) {
+ Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ return new ICmpInst(Pred, Or->getOperand(0), Or->getOperand(1));
+ }
+
+ if (!Cmp.isEquality() || !C->isNullValue() || !Or->hasOneUse())
+ return nullptr;
+
+ Value *P, *Q;
+ if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
+ // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
+ // -> and (icmp eq P, null), (icmp eq Q, null).
+ Value *CmpP =
+ Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
+ Value *CmpQ =
+ Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
+ auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And
+ : Instruction::Or;
+ return BinaryOperator::Create(LogicOpc, CmpP, CmpQ);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (mul X, Y), C.
+Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp,
+ BinaryOperator *Mul,
+ const APInt *C) {
+ const APInt *MulC;
+ if (!match(Mul->getOperand(1), m_APInt(MulC)))
+ return nullptr;
+
+ // If this is a test of the sign bit and the multiply is sign-preserving with
+ // a constant operand, use the multiply LHS operand instead.
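+ // Illustrative instances (constants chosen here):
+ //   icmp slt (mul nsw X, 5), 0  --> icmp slt X, 0
+ //   icmp slt (mul nsw X, -5), 0 --> icmp sgt X, 0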
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (isSignTest(Pred, *C) && Mul->hasNoSignedWrap()) {
+ if (MulC->isNegative())
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Mul->getOperand(0),
+ Constant::getNullValue(Mul->getType()));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (shl 1, Y), C.
+static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
+ const APInt *C) {
+ Value *Y;
+ if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
+ return nullptr;
+
+ Type *ShiftType = Shl->getType();
+ uint32_t TypeBits = C->getBitWidth();
+ bool CIsPowerOf2 = C->isPowerOf2();
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isUnsigned()) {
+ // (1 << Y) pred C -> Y pred Log2(C)
+ if (!CIsPowerOf2) {
+ // (1 << Y) < 30 -> Y <= 4
+ // (1 << Y) <= 30 -> Y <= 4
+ // (1 << Y) >= 30 -> Y > 4
+ // (1 << Y) > 30 -> Y > 4
+ if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_ULE;
+ else if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_UGT;
+ }
+
+ // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
+ // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
+ unsigned CLog2 = C->logBase2();
+ if (CLog2 == TypeBits - 1) {
+ if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_EQ;
+ else if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_NE;
+ }
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
+ } else if (Cmp.isSigned()) {
+ Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
+ if (C->isAllOnesValue()) {
+ // (1 << Y) <= -1 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) > -1 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ } else if (!(*C)) {
+ // (1 << Y) < 0 -> Y == 31
+ // (1 << Y) <= 0 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) >= 0 -> Y != 31
+ // (1 << Y) > 0 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ }
+ } else if (Cmp.isEquality() && CIsPowerOf2) {
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C->logBase2()));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (shl X, Y), C.
+Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
+ BinaryOperator *Shl,
+ const APInt *C) {
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShlConstConst(Cmp, Shl->getOperand(1), *C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
+ return foldICmpShlOne(Cmp, Shl, C);
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited, it will be simplified.
+ unsigned TypeBits = C->getBitWidth();
+ if (ShiftAmt->uge(TypeBits))
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Shl->getOperand(0);
+ Type *ShType = Shl->getType();
+
+ // NSW guarantees that we are only shifting out sign bits from the high bits,
+ // so we can ASHR the compare constant without needing a mask and eliminate
+ // the shift.
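+ // Illustrative instance (constants chosen here):
+ //   icmp sgt (shl nsw X, 3), 71 --> icmp sgt X, 8 (71 >>s 3 == 8)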
+ if (Shl->hasNoSignedWrap()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the SGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->ashr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_SLT) {
+ // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
+ // (X << S) <=s C is equiv to X <=s (C >> S) for all C
+ // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
+ // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
+ assert(!C->isMinSignedValue() && "Unexpected icmp slt");
+ APInt ShiftedC = (*C - 1).ashr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (isSignTest(Pred, *C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
+ }
+
+ // NUW guarantees that we are only shifting out zero bits from the high bits,
+ // so we can LSHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the UGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->lshr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert(C->ugt(0) && "ult 0 should have been eliminated");
+ APInt ShiftedC = (*C - 1).lshr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ }
+
+ if (Cmp.isEquality() && Shl->hasOneUse()) {
+ // Strength-reduce the shift into an 'and'.
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Constant *LShrC = ConstantInt::get(ShType, C->lshr(*ShiftAmt));
+ return new ICmpInst(Pred, And, LShrC);
+ }
+
+ // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+ bool TrueIfSigned = false;
+ if (Shl->hasOneUse() && isSignBitCheck(Pred, *C, TrueIfSigned)) {
+ // (X << 31) <s 0 --> (X & 1) != 0
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+ And, Constant::getNullValue(ShType));
+ }
+
+ // Transform (icmp pred iM (shl iM %v, N), C)
+ // -> (icmp pred i(M-N) (trunc iM %v to i(M-N)), (trunc (C>>N)))
+ // Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N is a
+ // legal integer width.
+ // This enables us to get rid of the shift in favor of a trunc that may be
+ // free on the target. It has the additional benefit of comparing to a
+ // smaller constant that may be more target-friendly.
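+ // Illustrative instance (assuming i32 is a legal integer for the target):
+ //   icmp eq (shl i64 %v, 32), C where C == (5 << 32)
+ //     --> icmp eq (trunc i64 %v to i32), 5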
+ unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
+ if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
+ DL.isLegalInteger(TypeBits - Amt)) {
+ Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
+ if (ShType->isVectorTy())
+ TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
+ Constant *NewC =
+ ConstantInt::get(TruncTy, C->ashr(*ShiftAmt).trunc(TypeBits - Amt));
+ return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp ({al}shr X, Y), C.
+Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
+ BinaryOperator *Shr,
+ const APInt *C) {
+ // An exact shr only shifts out zero bits, so:
+ // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
+ Value *X = Shr->getOperand(0);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
+ C->isNullValue())
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShrConstConst(Cmp, Shr->getOperand(1), *C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
+ return nullptr;
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited it will be simplified.
+ unsigned TypeBits = C->getBitWidth();
+ unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
+ if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+ return nullptr;
+
+ bool IsAShr = Shr->getOpcode() == Instruction::AShr;
+ if (!Cmp.isEquality()) {
+ // If we have an unsigned comparison and an ashr, we can't simplify this.
+ // Similarly for signed comparisons with lshr.
+ if (Cmp.isSigned() != IsAShr)
+ return nullptr;
+
+ // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
+ // by a power of 2. Since we already have logic to simplify these,
+ // transform to div and then simplify the resultant comparison.
+ if (IsAShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1))
+ return nullptr;
+
+ // Revisit the shift (to delete it).
+ Worklist.Add(Shr);
+
+ Constant *DivCst = ConstantInt::get(
+ Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
+
+ Value *Tmp = IsAShr ? Builder.CreateSDiv(X, DivCst, "", Shr->isExact())
+ : Builder.CreateUDiv(X, DivCst, "", Shr->isExact());
+
+ Cmp.setOperand(0, Tmp);
+
+ // If the builder folded the binop, just return it.
+ BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp);
+ if (!TheDiv)
+ return &Cmp;
+
+ // Otherwise, fold this div/compare.
+ assert(TheDiv->getOpcode() == Instruction::SDiv ||
+ TheDiv->getOpcode() == Instruction::UDiv);
+
+ Instruction *Res = foldICmpDivConstant(Cmp, TheDiv, C);
+ assert(Res && "This div/cst should have folded!");
+ return Res;
+ }
+
+ // Handle equality comparisons of shift-by-constant.
+
+ // If the comparison constant changes with the shift, the comparison cannot
+ // succeed (bits of the comparison constant cannot match the shifted value).
+ // This should be known by InstSimplify and already be folded to true/false.
+ assert(((IsAShr && C->shl(ShAmtVal).ashr(ShAmtVal) == *C) ||
+ (!IsAShr && C->shl(ShAmtVal).lshr(ShAmtVal) == *C)) &&
+ "Expected icmp+shr simplify did not occur.");
+
+ // Check if the bits shifted out are known to be zero. If so, we can compare
+ // against the unshifted value:
+ // (X & 4) >> 1 == 2 --> (X & 4) == 4.
+ Constant *ShiftedCmpRHS = ConstantInt::get(Shr->getType(), *C << ShAmtVal);
+ if (Shr->hasOneUse()) {
+ if (Shr->isExact())
+ return new ICmpInst(Pred, X, ShiftedCmpRHS);
+
+ // Otherwise strength reduce the shift into an 'and'.
+ APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+ Constant *Mask = ConstantInt::get(Shr->getType(), Val);
+ Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
+ return new ICmpInst(Pred, And, ShiftedCmpRHS);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (udiv X, Y), C.
+Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
+ BinaryOperator *UDiv,
+ const APInt *C) {
+ const APInt *C2;
+ if (!match(UDiv->getOperand(0), m_APInt(C2)))
+ return nullptr;
+
+ assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
+
+ // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
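+ // Worked example (illustrative): (udiv 64, Y) >u 3 --> Y <=u 16, because
+ // 64 / Y exceeds 3 exactly when Y <= 64 / 4 == 16.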
+ Value *Y = UDiv->getOperand(1);
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
+ assert(!C->isMaxValue() &&
+ "icmp ugt X, UINT_MAX should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_ULE, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(*C + 1)));
+ }
+
+ // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
+ if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
+ assert(*C != 0 && "icmp ult X, 0 should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_UGT, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(*C)));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp ({su}div X, Y), C.
+Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
+ BinaryOperator *Div,
+ const APInt *C) {
+ // Fold: icmp pred ([us]div X, C2), C -> range test
+ // Fold this div into the comparison, producing a range check.
+ // Determine, based on the divide type, what range is being
+ // checked. If there is an overflow on the low or high side, remember
+ // it; otherwise compute the range [low, hi) bounding the new value.
+ // See: InsertRangeTest above for the kinds of replacements possible.
+ const APInt *C2;
+ if (!match(Div->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // FIXME: If the operand types don't match the type of the divide
+ // then don't attempt this transform. The code below doesn't have the
+ // logic to deal with a signed divide and an unsigned compare (and
+ // vice versa). This is because (x /s C2) <s C produces different
+ // results than (x /s C2) <u C or (x /u C2) <s C or even
+ // (x /u C2) <u C. Simply casting the operands and result won't
+ // work. :( The if statement below tests that condition and bails
+ // if it finds it.
+ bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
+ if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
+ return nullptr;
+
+ // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
+ // INT_MIN will also fail if the divisor is 1. Although folds of all these
+ // division-by-constant cases should be present, we cannot assert that they
+ // have happened before we reach this icmp instruction.
+ if (C2->isNullValue() || C2->isOneValue() ||
+ (DivIsSigned && C2->isAllOnesValue()))
+ return nullptr;
+
+ // TODO: We could do all of the computations below using APInt.
+ Constant *CmpRHS = cast<Constant>(Cmp.getOperand(1));
+ Constant *DivRHS = cast<Constant>(Div->getOperand(1));
+
+ // Compute Prod = CmpRHS * DivRHS. We are essentially solving an equation of
+ // form X / C2 = C. We solve for X by multiplying C2 (DivRHS) and C (CmpRHS).
+ // By solving for X, we can turn this into a range check instead of computing
+ // a divide.
+ Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS);
+
+ // Determine if the product overflows by seeing if the product is not equal to
+ // the divide. Make sure we do the same kind of divide as in the LHS
+ // instruction that we're folding.
+ bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS)
+ : ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the division is known to be exact, then there is no remainder from the
+ // divide, so the covered range size is unit, otherwise it is the divisor.
+ Constant *RangeSize =
+ Div->isExact() ? ConstantInt::get(Div->getType(), 1) : DivRHS;
+
+ // Figure out the interval that is being checked. For example, a comparison
+ // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+ // Compute this interval based on the constants involved and the signedness of
+ // the compare/divide. This computes a half-open interval, keeping track of
+ // whether either value in the interval overflows. After analysis each
+ // overflow variable is set to 0 if its corresponding bound variable is valid,
+ // -1 if it overflowed off the bottom end, or +1 if it overflowed off the top
+ // end.
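+ // Worked example (illustrative): icmp eq (udiv X, 5), 3 checks that X is in
+ // [15, 20), which insertRangeTest expresses as an unsigned range check of
+ // the form (X - 15) <u 5.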
+ int LoOverflow = 0, HiOverflow = 0;
+ Constant *LoBound = nullptr, *HiBound = nullptr;
+
+ if (!DivIsSigned) { // udiv
+ // e.g. X/5 op 3 --> [15, 20)
+ LoBound = Prod;
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow) {
+ // If this is not an exact divide, then many values in the range collapse
+ // to the same result value.
+ HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
+ }
+ } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
+ if (C->isNullValue()) { // (X / pos) op 0
+ // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
+ LoBound = ConstantExpr::getNeg(SubOne(RangeSize));
+ HiBound = RangeSize;
+ } else if (C->isStrictlyPositive()) { // (X / pos) op pos
+ LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
+ } else { // (X / pos) op neg
+ // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
+ HiBound = AddOne(Prod);
+ LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow) {
+ Constant *DivNeg = ConstantExpr::getNeg(RangeSize);
+ LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+ }
+ }
+ } else if (C2->isNegative()) { // Divisor is < 0.
+ if (Div->isExact())
+ RangeSize = ConstantExpr::getNeg(RangeSize);
+ if (C->isNullValue()) { // (X / neg) op 0
+ // e.g. X/-5 op 0 --> [-4, 5)
+ LoBound = AddOne(RangeSize);
+ HiBound = ConstantExpr::getNeg(RangeSize);
+ if (HiBound == DivRHS) { // -INTMIN = INTMIN
+ HiOverflow = 1; // [INTMIN+1, overflow)
+ HiBound = nullptr; // e.g. X/INTMIN = 0 --> X > INTMIN
+ }
+ } else if (C->isStrictlyPositive()) { // (X / neg) op pos
+ // e.g. X/-5 op 3 --> [-19, -14)
+ HiBound = AddOne(Prod);
+ HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow)
+ LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+ } else { // (X / neg) op neg
+ LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
+ LoOverflow = HiOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
+ }
+
+ // Dividing by a negative swaps the condition. LT <-> GT
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ Value *X = Div->getOperand(0);
+ switch (Pred) {
+ default: llvm_unreachable("Unhandled icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, LoBound);
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, HiBound);
+ return replaceInstUsesWith(
+ Cmp, insertRangeTest(X, LoBound->getUniqueInteger(),
+ HiBound->getUniqueInteger(), DivIsSigned, true));
+ case ICmpInst::ICMP_NE:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X, LoBound);
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X, HiBound);
+ return replaceInstUsesWith(Cmp,
+ insertRangeTest(X, LoBound->getUniqueInteger(),
+ HiBound->getUniqueInteger(),
+ DivIsSigned, false));
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ if (LoOverflow == +1) // Low bound is greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (LoOverflow == -1) // Low bound is less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ return new ICmpInst(Pred, X, LoBound);
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ if (HiOverflow == +1) // High bound greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow == -1) // High bound less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp (sub X, Y), C.
+Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
+ BinaryOperator *Sub,
+ const APInt *C) {
+ Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // The following transforms are only worth it if the only user of the subtract
+ // is the icmp.
+ if (!Sub->hasOneUse())
+ return nullptr;
+
+ if (Sub->hasNoSignedWrap()) {
+ // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C->isAllOnesValue())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C->isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C->isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C->isOneValue())
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+ }
+
+ const APInt *C2;
+ if (!match(X, m_APInt(C2)))
+ return nullptr;
+
+ // C2 - Y <u C -> (Y | (C - 1)) == C2
+ // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
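+ // Illustrative instance (constants chosen here): 11 - Y <u 4 becomes
+ // (Y | 3) == 11, since 4 is a power of 2 and 11 has its low two bits set.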
+ if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
+ (*C2 & (*C - 1)) == (*C - 1))
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, *C - 1), X);
+
+ // C2 - Y >u C -> (Y | C) != C2
+ // iff C2 & C == C and C + 1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == *C)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, *C), X);
+
+ return nullptr;
+}
+
+/// Fold icmp (add X, Y), C.
+Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
+ BinaryOperator *Add,
+ const APInt *C) {
+ Value *Y = Add->getOperand(1);
+ const APInt *C2;
+ if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
+ return nullptr;
+
+ // Fold icmp pred (add X, C2), C.
+ Value *X = Add->getOperand(0);
+ Type *Ty = Add->getType();
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the add does not wrap, we can always adjust the compare by subtracting
+ // the constants. Equality comparisons are handled elsewhere. SGE/SLE are
+ // canonicalized to SGT/SLT.
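+ // For example (illustrative): icmp sgt (add nsw X, 5), 20 --> icmp sgt X, 15.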
+ if (Add->hasNoSignedWrap() &&
+ (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) {
+ bool Overflow;
+ APInt NewC = C->ssub_ov(*C2, Overflow);
+ // If there is overflow, the result must be true or false.
+ // TODO: Can we assert there is no overflow because InstSimplify always
+ // handles those cases?
+ if (!Overflow)
+ // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+ return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+ }
+
+ auto CR = ConstantRange::makeExactICmpRegion(Pred, *C).subtract(*C2);
+ const APInt &Upper = CR.getUpper();
+ const APInt &Lower = CR.getLower();
+ if (Cmp.isSigned()) {
+ if (Lower.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
+ } else {
+ if (Lower.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
+ }
+
+ if (!Add->hasOneUse())
+ return nullptr;
+
+ // X+C <u C2 -> (X & -C2) == C
+ // iff C & (C2-1) == 0
+ // C2 is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() && (*C2 & (*C - 1)) == 0)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -(*C)),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ // X+C >u C2 -> (X & ~C2) != C
+ // iff C & C2 == 0
+ // C2+1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == 0)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~(*C)),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ return nullptr;
+}
+
+bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
+ Value *&RHS, ConstantInt *&Less,
+ ConstantInt *&Equal,
+ ConstantInt *&Greater) {
+ // TODO: Generalize this to work with other comparison idioms or ensure
+ // they get canonicalized into this form.
+
+ // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32
+ // Greater), where Equal, Less and Greater are placeholders for any three
+ // constants.
+ ICmpInst::Predicate PredA, PredB;
+ if (match(SI->getTrueValue(), m_ConstantInt(Equal)) &&
+ match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) &&
+ PredA == ICmpInst::ICMP_EQ &&
+ match(SI->getFalseValue(),
+ m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)),
+ m_ConstantInt(Less), m_ConstantInt(Greater))) &&
+ PredB == ICmpInst::ICMP_SLT) {
+ return true;
+ }
+ return false;
+}
+
+Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
+ Instruction *Select,
+ ConstantInt *C) {
+
+ assert(C && "Cmp RHS should be a constant int!");
+ // If we're testing a constant value against the result of a three way
+ // comparison, the result can be expressed directly in terms of the
+ // original values being compared. Note: We could possibly be more
+ // aggressive here and remove the hasOneUse test. The original select is
+ // really likely to simplify or sink when we remove a test of the result.
+ Value *OrigLHS, *OrigRHS;
+ ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
+ if (Cmp.hasOneUse() &&
+ matchThreeWayIntCompare(cast<SelectInst>(Select), OrigLHS, OrigRHS,
+ C1LessThan, C2Equal, C3GreaterThan)) {
+ assert(C1LessThan && C2Equal && C3GreaterThan);
+
+ bool TrueWhenLessThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
+ ->isAllOnesValue();
+ bool TrueWhenEqual =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
+ ->isAllOnesValue();
+ bool TrueWhenGreaterThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
+ ->isAllOnesValue();
+
+ // This generates the new instruction that will replace the original Cmp
+ // Instruction. Instead of enumerating the various combinations when
+ // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
+ // false, we rely on chaining of ORs and future passes of InstCombine to
+ // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
+
+ // When none of the three constants satisfies the predicate for the RHS (C),
+ // the entire original Cmp can be simplified to false.
+ Value *Cond = Builder.getFalse();
+ if (TrueWhenLessThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS));
+ if (TrueWhenEqual)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS));
+ if (TrueWhenGreaterThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS));
+
+ return replaceInstUsesWith(Cmp, Cond);
+ }
+ return nullptr;
+}
+
+/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
+/// where X is some kind of instruction.
+Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
+
+ BinaryOperator *BO;
+ if (match(Cmp.getOperand(0), m_BinOp(BO))) {
+ switch (BO->getOpcode()) {
+ case Instruction::Xor:
+ if (Instruction *I = foldICmpXorConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::And:
+ if (Instruction *I = foldICmpAndConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Or:
+ if (Instruction *I = foldICmpOrConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Mul:
+ if (Instruction *I = foldICmpMulConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Shl:
+ if (Instruction *I = foldICmpShlConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Instruction *I = foldICmpShrConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::UDiv:
+ if (Instruction *I = foldICmpUDivConstant(Cmp, BO, C))
+ return I;
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:
+ if (Instruction *I = foldICmpDivConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Sub:
+ if (Instruction *I = foldICmpSubConstant(Cmp, BO, C))
+ return I;
+ break;
+ case Instruction::Add:
+ if (Instruction *I = foldICmpAddConstant(Cmp, BO, C))
+ return I;
+ break;
+ default:
+ break;
+ }
+ // TODO: These folds could be refactored to be part of the above calls.
+ if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, C))
+ return I;
+ }
+
+ // Match against CmpInst LHS being instructions other than binary operators.
+ Instruction *LHSI;
+ if (match(Cmp.getOperand(0), m_Instruction(LHSI))) {
+ switch (LHSI->getOpcode()) {
+ case Instruction::Select:
+ {
+ // For now, we only support constant integers while folding the
+ // ICMP(SELECT) pattern. We can extend this to support vectors of integers
+ // similar to the cases handled by binary ops above.
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
+ if (Instruction *I = foldICmpSelectConstant(Cmp, LHSI, ConstRHS))
+ return I;
+ break;
+ }
+ case Instruction::Trunc:
+ if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C))
+ return I;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C))
+ return I;
+
+ return nullptr;
+}
+
+/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
+/// icmp eq/ne BO, C.
+Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt *C) {
+ // TODO: Some of these folds could work with arbitrary constants, but this
+ // function is limited to scalar and vector splat constants.
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
+ Constant *RHS = cast<Constant>(Cmp.getOperand(1));
+ Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+ switch (BO->getOpcode()) {
+ case Instruction::SRem:
+ // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
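+ // e.g. (illustrative): (srem X, 8) == 0 --> (urem X, 8) == 0.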
+ if (C->isNullValue() && BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
+ Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
+ return new ICmpInst(Pred, NewRem,
+ Constant::getNullValue(BO->getType()));
+ }
+ }
+ break;
+ case Instruction::Add: {
+ // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ if (BO->hasOneUse()) {
+ Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1));
+ return new ICmpInst(Pred, BOp0, SubC);
+ }
+ } else if (C->isNullValue()) {
+ // Replace ((add A, B) != 0) with (A != -B) if A or B is
+ // efficiently invertible, or if the add has just this one use.
+ if (Value *NegVal = dyn_castNegVal(BOp1))
+ return new ICmpInst(Pred, BOp0, NegVal);
+ if (Value *NegVal = dyn_castNegVal(BOp0))
+ return new ICmpInst(Pred, NegVal, BOp1);
+ if (BO->hasOneUse()) {
+ Value *Neg = Builder.CreateNeg(BOp1);
+ Neg->takeName(BO);
+ return new ICmpInst(Pred, BOp0, Neg);
+ }
+ }
+ break;
+ }
+ case Instruction::Xor:
+ if (BO->hasOneUse()) {
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
+ } else if (C->isNullValue()) {
+ // Replace ((xor A, B) != 0) with (A != B)
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Sub:
+ if (BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp0, m_APInt(BOC))) {
+ // Replace ((sub BOC, B) != C) with (B != BOC-C).
+ Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS);
+ return new ICmpInst(Pred, BOp1, SubC);
+ } else if (C->isNullValue()) {
+ // Replace ((sub A, B) != 0) with (A != B).
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Or: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
+ // Comparing if all bits outside of a constant mask are set?
+ // Replace (X | C) == -1 with (X & ~C) == ~C.
+ // This removes the -1 constant.
+ Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
+ Value *And = Builder.CreateAnd(BOp0, NotBOC);
+ return new ICmpInst(Pred, And, NotBOC);
+ }
+ break;
+ }
+ case Instruction::And: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ // If we have ((X & C) == C), turn it into ((X & C) != 0).
+ if (C == BOC && C->isPowerOf2())
+ return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BO, Constant::getNullValue(RHS->getType()));
+
+ // Don't perform the following transforms if the AND has multiple uses
+ if (!BO->hasOneUse())
+ break;
+
+ // Replace ((and X, (1 << size(X)-1)) != 0) with X s< 0
+ if (BOC->isSignMask()) {
+ Constant *Zero = Constant::getNullValue(BOp0->getType());
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+ return new ICmpInst(NewPred, BOp0, Zero);
+ }
+
+ // ((X & ~7) == 0) --> X u< 8
+ if (C->isNullValue() && (~(*BOC) + 1).isPowerOf2()) {
+ Constant *NegBOC = ConstantExpr::getNeg(cast<Constant>(BOp1));
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, BOp0, NegBOC);
+ }
+ }
+ break;
+ }
+ case Instruction::Mul:
+ if (C->isNullValue() && BO->hasNoSignedWrap()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && !BOC->isNullValue()) {
+ // The trivial case (mul X, 0) is handled by InstSimplify.
+ // General case : (mul X, C) != 0 iff X != 0
+ // (mul X, C) == 0 iff X == 0
+ return new ICmpInst(Pred, BOp0, Constant::getNullValue(RHS->getType()));
+ }
+ }
+ break;
+ case Instruction::UDiv:
+ if (C->isNullValue()) {
+ // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, BOp1, BOp0);
+ }
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
+Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
+ const APInt *C) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0));
+ if (!II || !Cmp.isEquality())
+ return nullptr;
+
+ // Handle icmp {eq|ne} <intrinsic>, Constant.
+ Type *Ty = II->getType();
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::bswap:
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ Cmp.setOperand(1, ConstantInt::get(Ty, C->byteSwap()));
+ return &Cmp;
+
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
+ if (*C == C->getBitWidth()) {
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ Cmp.setOperand(1, ConstantInt::getNullValue(Ty));
+ return &Cmp;
+ }
+ break;
+
+ case Intrinsic::ctpop: {
+ // popcount(A) == 0 -> A == 0 and likewise for !=
+ // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
+ bool IsZero = C->isNullValue();
+ if (IsZero || *C == C->getBitWidth()) {
+ Worklist.Add(II);
+ Cmp.setOperand(0, II->getArgOperand(0));
+ auto *NewOp =
+ IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty);
+ Cmp.setOperand(1, NewOp);
+ return &Cmp;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
+/// Handle icmp with constant (but not simple integer constant) RHS.
+Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Constant *RHSC = dyn_cast<Constant>(Op1);
+ Instruction *LHSI = dyn_cast<Instruction>(Op0);
+ if (!RHSC || !LHSI)
+ return nullptr;
+
+ switch (LHSI->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+ if (RHSC->isNullValue() &&
+ cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+ case Instruction::PHI:
+ // Only fold icmp into the PHI if the phi and icmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::Select: {
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = nullptr, *Op2 = nullptr;
+ ConstantInt *CI = nullptr;
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op1);
+ }
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op2);
+ }
+
+ // We only want to perform this transformation if it will not lead to
+ // additional code. This is true if either both sides of the select
+ // fold to a constant (in which case the icmp is replaced with a select
+ // which will usually simplify) or this is the only user of the
+ // select (in which case we are trading a select+icmp for a simpler
+ // select+icmp) or all uses of the select can be replaced based on
+ // dominance information ("Global cases").
+ bool Transform = false;
+ if (Op1 && Op2)
+ Transform = true;
+ else if (Op1 || Op2) {
+ // Local case
+ if (LHSI->hasOneUse())
+ Transform = true;
+ // Global cases
+ else if (CI && !CI->isZero())
+ // When Op1 is constant try replacing select with second operand.
+ // Otherwise Op2 is constant and try replacing select with first
+ // operand.
+ Transform =
+ replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
+ }
+ if (Transform) {
+ if (!Op1)
+ Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
+ I.getName());
+ if (!Op2)
+ Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
+ I.getName());
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+ }
+ break;
+ }
+ case Instruction::IntToPtr:
+ // icmp pred inttoptr(X), null -> icmp pred X, 0
+ if (RHSC->isNullValue() &&
+ DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+
+ case Instruction::Load:
+ // Try to optimize things like "A[i] > 4" to index computations.
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
+/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
+Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Special logic for binary operators.
+ BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
+ BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
+ if (!BO0 && !BO1)
+ return nullptr;
+
+ const CmpInst::Predicate Pred = I.getPredicate();
+ bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
+ if (BO0 && isa<OverflowingBinaryOperator>(BO0))
+ NoOp0WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
+ if (BO1 && isa<OverflowingBinaryOperator>(BO1))
+ NoOp1WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
+
+ // Analyze the case when either Op0 or Op1 is an add instruction.
+ // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Add) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Add) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
+ if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
+ return new ICmpInst(Pred, A == Op1 ? B : A,
+ Constant::getNullValue(Op1->getType()));
+
+ // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
+ if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
+ C == Op0 ? D : C);
+
+ // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
+ if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
+ NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse()) {
+ // Determine Y and Z in the form icmp (X+Y), (X+Z).
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
+ }
+ return new ICmpInst(Pred, Y, Z);
+ }
+
+ // icmp slt (X + -1), Y -> icmp sle X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
+
+ // icmp sge (X + -1), Y -> icmp sgt X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
+
+ // icmp sle (X + 1), Y -> icmp slt X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
+
+ // icmp sgt (X + 1), Y -> icmp sge X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
+
+ // icmp sgt X, (Y + -1) -> icmp sge X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
+
+ // icmp sle X, (Y + -1) -> icmp slt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
+
+ // icmp sge X, (Y + 1) -> icmp sgt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
+
+ // icmp slt X, (Y + 1) -> icmp sle X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+
+ // TODO: The subtraction-related identities shown below also hold, but
+ // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
+ // wouldn't happen even if they were implemented.
+ //
+ // icmp ult (X - 1), Y -> icmp ule X, Y
+ // icmp uge (X - 1), Y -> icmp ugt X, Y
+ // icmp ugt X, (Y - 1) -> icmp uge X, Y
+ // icmp ule X, (Y - 1) -> icmp ult X, Y
+
+ // icmp ule (X + 1), Y -> icmp ult X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+ // icmp ugt (X + 1), Y -> icmp uge X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+ // icmp uge X, (Y + 1) -> icmp ugt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+ // icmp ult X, (Y + 1) -> icmp ule X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
+ // if C1 has greater magnitude than C2:
+ // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
+ // s.t. C3 = C1 - C2
+ //
+ // if C2 has greater magnitude than C1:
+ // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
+ // s.t. C3 = C2 - C1
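+ //
+ // A sketch with illustrative constants (nsw adds, signed predicate):
+ //   icmp slt (X + 7), (Y + 3)  -->  icmp slt (X + 4), Y
+ //   icmp slt (X + 3), (Y + 7)  -->  icmp slt X, (Y + 4)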
+ if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+ if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+ const APInt &AP1 = C1->getValue();
+ const APInt &AP2 = C2->getValue();
+ if (AP1.isNegative() == AP2.isNegative()) {
+ APInt AP1Abs = C1->getValue().abs();
+ APInt AP2Abs = C2->getValue().abs();
+ if (AP1Abs.uge(AP2Abs)) {
+ ConstantInt *C3 = Builder.getInt(AP1 - AP2);
+ Value *NewAdd = Builder.CreateNSWAdd(A, C3);
+ return new ICmpInst(Pred, NewAdd, C);
+ } else {
+ ConstantInt *C3 = Builder.getInt(AP2 - AP1);
+ Value *NewAdd = Builder.CreateNSWAdd(C, C3);
+ return new ICmpInst(Pred, A, NewAdd);
+ }
+ }
+ }
+
+ // Analyze the case when either Op0 or Op1 is a sub instruction.
+ // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+ A = nullptr;
+ B = nullptr;
+ C = nullptr;
+ D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
+ if (A == Op1 && NoOp0WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+
+ // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
+ if (C == Op0 && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+ // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
+ if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse())
+ return new ICmpInst(Pred, A, C);
+
+ // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
+ if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ // Try not to increase register pressure.
+ BO0->hasOneUse() && BO1->hasOneUse())
+ return new ICmpInst(Pred, D, B);
+
+ // icmp (0-X) < cst --> X > -cst
+ if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
+ Value *X;
+ if (match(BO0, m_Neg(m_Value(X))))
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
+ if (!RHSC->isMinValue(/*isSigned=*/true))
+ return new ICmpInst(I.getSwappedPredicate(), X,
+ ConstantExpr::getNeg(RHSC));
+ }
+
+ BinaryOperator *SRem = nullptr;
+ // icmp (srem X, Y), Y
+ if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
+ SRem = BO0;
+ // icmp Y, (srem X, Y)
+ else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
+ Op0 == BO1->getOperand(1))
+ SRem = BO1;
+ if (SRem) {
+ // We don't check hasOneUse to avoid increasing register pressure because
+ // the value we use is the same value this instruction was already using.
+ switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_EQ:
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ case ICmpInst::ICMP_NE:
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE:
+ return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
+ Constant::getAllOnesValue(SRem->getType()));
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE:
+ return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
+ Constant::getNullValue(SRem->getType()));
+ }
+ }
+
+ if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
+ BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
+ switch (BO0->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor: {
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C))) {
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ if (C->isSignMask()) {
+ ICmpInst::Predicate NewPred =
+ I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+
+ // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
+ if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
+ ICmpInst::Predicate NewPred =
+ I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ break;
+ }
+ case Instruction::Mul: {
+ if (!I.isEquality())
+ break;
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
+ !C->isOneValue()) {
+ // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
+ // Mask = -1 >> count-trailing-zeros(C).
+ if (unsigned TZs = C->countTrailingZeros()) {
+ Constant *Mask = ConstantInt::get(
+ BO0->getType(),
+ APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
+ Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
+ Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
+ return new ICmpInst(Pred, And1, And2);
+ }
+ // If there are no trailing zeros in the multiplier, just eliminate
+ // the multiplies (no masking is needed):
+ // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::AShr:
+ if (!BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::Shl: {
+ bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
+ bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+ if (!NUW && !NSW)
+ break;
+ if (!NSW && I.isSigned())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ }
+
+ if (BO0) {
+ // Transform A & (L - 1) `ult` L --> L != 0
+ auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
+ auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
+
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
+ auto *Zero = Constant::getNullValue(BO0->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp Pred min|max(X, Y), X.
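+ /// Two representative cases handled below (X and Y stand for arbitrary
+ /// operands):
+ ///   icmp sge (smin X, Y), X  -->  icmp sle X, Y
+ ///   icmp ne  (umax X, Y), X  -->  icmp ult X, Y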
+static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0);
+ Value *X = Cmp.getOperand(1);
+
+ // Canonicalize minimum or maximum operand to LHS of the icmp.
+ if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
+ std::swap(Op0, X);
+ Pred = Cmp.getSwappedPredicate();
+ }
+
+ Value *Y;
+ if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
+ // smin(X, Y) == X --> X s<= Y
+ // smin(X, Y) s>= X --> X s<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+
+ // smin(X, Y) != X --> X s> Y
+ // smin(X, Y) s< X --> X s> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smin(X, Y) s<= X --> true
+ // smin(X, Y) s> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
+ // smax(X, Y) == X --> X s>= Y
+ // smax(X, Y) s<= X --> X s>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // smax(X, Y) != X --> X s< Y
+ // smax(X, Y) s> X --> X s< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smax(X, Y) s>= X --> true
+ // smax(X, Y) s< X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
+ // umin(X, Y) == X --> X u<= Y
+ // umin(X, Y) u>= X --> X u<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
+
+ // umin(X, Y) != X --> X u> Y
+ // umin(X, Y) u< X --> X u> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umin(X, Y) u<= X --> true
+ // umin(X, Y) u> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
+ // umax(X, Y) == X --> X u>= Y
+ // umax(X, Y) u<= X --> X u>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
+
+ // umax(X, Y) != X --> X u< Y
+ // umax(X, Y) u> X --> X u< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umax(X, Y) u>= X --> true
+ // umax(X, Y) u< X --> false
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
+ if (!I.isEquality())
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+ // A^c1 == C^c2 --> A == C^(c1^c2)
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
+ Op1->hasOneUse()) {
+ Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
+ Value *Xor = Builder.CreateXor(C, NC);
+ return new ICmpInst(Pred, A, Xor);
+ }
+
+ // A^B == A^D -> B == D
+ if (A == C)
+ return new ICmpInst(Pred, B, D);
+ if (A == D)
+ return new ICmpInst(Pred, B, C);
+ if (B == C)
+ return new ICmpInst(Pred, A, D);
+ if (B == D)
+ return new ICmpInst(Pred, A, C);
+ }
+ }
+
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+ if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+ match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+
+ if (A == C) {
+ X = B;
+ Y = D;
+ Z = A;
+ } else if (A == D) {
+ X = B;
+ Y = C;
+ Z = A;
+ } else if (B == C) {
+ X = A;
+ Y = D;
+ Z = B;
+ } else if (B == D) {
+ X = A;
+ Y = C;
+ Z = B;
+ }
+
+ if (X) { // Build (X^Y) & Z
+ Op1 = Builder.CreateXor(X, Y);
+ Op1 = Builder.CreateAnd(Op1, Z);
+ I.setOperand(0, Op1);
+ I.setOperand(1, Constant::getNullValue(Op1->getType()));
+ return &I;
+ }
+ }
+
+ // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+ // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+ ConstantInt *Cst1;
+ if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
+ match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+ (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+ match(Op1, m_ZExt(m_Value(A))))) {
+ APInt Pow2 = Cst1->getValue() + 1;
+ if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+ Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+ return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
+ }
+
+ // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+ // For lshr and ashr pairs.
+ if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+ (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+ return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
+ }
+ }
+
+ // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
+ if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
+ Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
+ I.getName() + ".mask");
+ return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
+ }
+ }
+
+ // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
+ // "icmp (and X, mask), cst"
+ uint64_t ShAmt = 0;
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
+ match(Op1, m_ConstantInt(Cst1)) &&
+ // Only do this when A has multiple uses. This is most important to do
+ // when it exposes other optimizations.
+ !A->hasOneUse()) {
+ unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+
+ if (ShAmt < ASize) {
+ APInt MaskV =
+ APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+ MaskV <<= ShAmt;
+
+ APInt CmpV = Cst1->getValue().zext(ASize);
+ CmpV <<= ShAmt;
+
+ Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
+ return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
+ }
+ }
+
+ // If both operands are byte-swapped or bit-reversed, just compare the
+ // original values.
+ // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
+ // and handle more intrinsics.
+ if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
+ (match(Op0, m_BitReverse(m_Value(A))) &&
+ match(Op1, m_BitReverse(m_Value(B)))))
+ return new ICmpInst(Pred, A, B);
+
+ return nullptr;
+}
+
+/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so
+/// far.
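+ ///
+ /// For example, with illustrative i8 operands extended to i32:
+ ///   icmp ult (zext %a), (zext %b)  -->  icmp ult i8 %a, %b
+ ///   icmp slt (sext %a), (sext %b)  -->  icmp slt i8 %a, %b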
+Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
+ const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0));
+ Value *LHSCIOp = LHSCI->getOperand(0);
+ Type *SrcTy = LHSCIOp->getType();
+ Type *DestTy = LHSCI->getType();
+ Value *RHSCIOp;
+
+ // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+ // integer type is the same size as the pointer type.
+ if (LHSCI->getOpcode() == Instruction::PtrToInt &&
+ DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
+ Value *RHSOp = nullptr;
+ if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
+ Value *RHSCIOp = RHSC->getOperand(0);
+ if (RHSCIOp->getType()->getPointerAddressSpace() ==
+ LHSCIOp->getType()->getPointerAddressSpace()) {
+ RHSOp = RHSC->getOperand(0);
+ // If the pointer types don't match, insert a bitcast.
+ if (LHSCIOp->getType() != RHSOp->getType())
+ RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType());
+ }
+ } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
+ RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+ }
+
+ if (RHSOp)
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp);
+ }
+
+ // The code below only handles extension cast instructions, so far.
+ // Enforce this.
+ if (LHSCI->getOpcode() != Instruction::ZExt &&
+ LHSCI->getOpcode() != Instruction::SExt)
+ return nullptr;
+
+ bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
+ bool isSignedCmp = ICmp.isSigned();
+
+ if (auto *CI = dyn_cast<CastInst>(ICmp.getOperand(1))) {
+ // Not an extension from the same type?
+ RHSCIOp = CI->getOperand(0);
+ if (RHSCIOp->getType() != LHSCIOp->getType())
+ return nullptr;
+
+ // If the signedness of the two casts doesn't agree (i.e. one is a sext
+ // and the other is a zext), then we can't handle this.
+ if (CI->getOpcode() != LHSCI->getOpcode())
+ return nullptr;
+
+ // Deal with equality cases early.
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (isSignedCmp && isSignedExt)
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
+ }
+
+ // If we aren't dealing with a constant on the RHS, exit early.
+ auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
+ if (!C)
+ return nullptr;
+
+ // Compute the constant that would happen if we truncated to SrcTy then
+ // re-extended to DestTy.
+ Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);
+
+ // If the re-extended constant didn't change...
+ if (Res2 == C) {
+ // Deal with equality cases early.
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (isSignedExt && isSignedCmp)
+ return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1);
+ }
+
+ // The re-extended constant changed, partly changed (in the case of a vector),
+ // or could not be determined to be equal (in the case of a constant
+ // expression), so the constant cannot be represented in the shorter type.
+ // Consequently, we cannot emit a simple comparison.
+ // All the cases that fold to true or false will have already been handled
+ // by SimplifyICmpInst, so only deal with the tricky case.
+
+ if (isSignedCmp || !isSignedExt || !isa<ConstantInt>(C))
+ return nullptr;
+
+ // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
+ // should have been folded away previously and should not reach here.
+
+ // We're performing an unsigned comp with a sign extended value.
+ // This is true if the input is >= 0. [aka >s -1]
+ Constant *NegOne = Constant::getAllOnesValue(SrcTy);
+ Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName());
+
+ // Finally, return the value computed.
+ if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
+ return replaceInstUsesWith(ICmp, Result);
+
+ assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
+ return BinaryOperator::CreateNot(Result);
+}
+
+bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
+ Value *RHS, Instruction &OrigI,
+ Value *&Result, Constant *&Overflow) {
+ if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
+ std::swap(LHS, RHS);
+
+ auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) {
+ Result = OpResult;
+ Overflow = OverflowVal;
+ if (ReuseName)
+ Result->takeName(&OrigI);
+ return true;
+ };
+
+ // If the overflow check was an add followed by a compare, the insertion point
+ // may be pointing to the compare. We want to insert the new instructions
+ // before the add in case there are uses of the add between the add and the
+ // compare.
+ Builder.SetInsertPoint(&OrigI);
+
+ switch (OCF) {
+ case OCF_INVALID:
+ llvm_unreachable("bad overflow check kind!");
+
+ case OCF_UNSIGNED_ADD: {
+ OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, &OrigI);
+ if (OR == OverflowResult::NeverOverflows)
+ return SetResult(Builder.CreateNUWAdd(LHS, RHS), Builder.getFalse(),
+ true);
+
+ if (OR == OverflowResult::AlwaysOverflows)
+ return SetResult(Builder.CreateAdd(LHS, RHS), Builder.getTrue(), true);
+
+ // Fall through uadd into sadd
+ LLVM_FALLTHROUGH;
+ }
+ case OCF_SIGNED_ADD: {
+ // X + 0 -> {X, false}
+ if (match(RHS, m_Zero()))
+ return SetResult(LHS, Builder.getFalse(), false);
+
+ // We can strength reduce this signed add into a regular add if we can prove
+ // that it will never overflow.
+ if (OCF == OCF_SIGNED_ADD)
+ if (willNotOverflowSignedAdd(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWAdd(LHS, RHS), Builder.getFalse(),
+ true);
+ break;
+ }
+
+ case OCF_UNSIGNED_SUB:
+ case OCF_SIGNED_SUB: {
+ // X - 0 -> {X, false}
+ if (match(RHS, m_Zero()))
+ return SetResult(LHS, Builder.getFalse(), false);
+
+ if (OCF == OCF_SIGNED_SUB) {
+ if (willNotOverflowSignedSub(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWSub(LHS, RHS), Builder.getFalse(),
+ true);
+ } else {
+ if (willNotOverflowUnsignedSub(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNUWSub(LHS, RHS), Builder.getFalse(),
+ true);
+ }
+ break;
+ }
+
+ case OCF_UNSIGNED_MUL: {
+ OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, &OrigI);
+ if (OR == OverflowResult::NeverOverflows)
+ return SetResult(Builder.CreateNUWMul(LHS, RHS), Builder.getFalse(),
+ true);
+ if (OR == OverflowResult::AlwaysOverflows)
+ return SetResult(Builder.CreateMul(LHS, RHS), Builder.getTrue(), true);
+ LLVM_FALLTHROUGH;
+ }
+ case OCF_SIGNED_MUL:
+ // X * undef -> undef
+ if (isa<UndefValue>(RHS))
+ return SetResult(RHS, UndefValue::get(Builder.getInt1Ty()), false);
+
+ // X * 0 -> {0, false}
+ if (match(RHS, m_Zero()))
+ return SetResult(RHS, Builder.getFalse(), false);
+
+ // X * 1 -> {X, false}
+ if (match(RHS, m_One()))
+ return SetResult(LHS, Builder.getFalse(), false);
+
+ if (OCF == OCF_SIGNED_MUL)
+ if (willNotOverflowSignedMul(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWMul(LHS, RHS), Builder.getFalse(),
+ true);
+ break;
+ }
+
+ return false;
+}
+
+ /// \brief Recognize and process an idiom involving a test for multiplication
+ /// overflow.
+ ///
+ /// The caller has matched a pattern of the form:
+ ///   I = cmp u (mul(zext A, zext B), V)
+ /// The function checks if this is a test for overflow and, if so, replaces
+ /// the multiplication with a call to the 'mul.with.overflow' intrinsic.
+///
+/// \param I Compare instruction.
+ /// \param MulVal Result of the 'mul' instruction. It is one of the arguments of
+/// the compare instruction. Must be of integer type.
+/// \param OtherVal The other argument of compare instruction.
+ /// \returns Instruction which must replace the compare instruction, null if no
+ /// replacement is required.
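+ ///
+ /// For example, a hypothetical i32 overflow check widened through i64:
+ ///   %za  = zext i32 %a to i64
+ ///   %zb  = zext i32 %b to i64
+ ///   %mul = mul i64 %za, %zb
+ ///   %cmp = icmp ugt i64 %mul, 4294967295
+ /// becomes
+ ///   %res = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ ///   %cmp = extractvalue { i32, i1 } %res, 1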
+static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
+ Value *OtherVal, InstCombiner &IC) {
+ // Don't bother doing this transformation for pointers or vectors.
+ if (!isa<IntegerType>(MulVal->getType()))
+ return nullptr;
+
+ assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
+ assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
+ auto *MulInstr = dyn_cast<Instruction>(MulVal);
+ if (!MulInstr)
+ return nullptr;
+ assert(MulInstr->getOpcode() == Instruction::Mul);
+
+ auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
+ *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
+ assert(LHS->getOpcode() == Instruction::ZExt);
+ assert(RHS->getOpcode() == Instruction::ZExt);
+ Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
+
+ // Calculate type and width of the result produced by mul.with.overflow.
+ Type *TyA = A->getType(), *TyB = B->getType();
+ unsigned WidthA = TyA->getPrimitiveSizeInBits(),
+ WidthB = TyB->getPrimitiveSizeInBits();
+ unsigned MulWidth;
+ Type *MulType;
+ if (WidthB > WidthA) {
+ MulWidth = WidthB;
+ MulType = TyB;
+ } else {
+ MulWidth = WidthA;
+ MulType = TyA;
+ }
+
+ // In order to replace the original mul with a narrower mul.with.overflow,
+ // all uses must ignore the upper bits of the product. The number of used
+ // low bits must not be greater than the width of mul.with.overflow.
+ if (MulVal->hasNUsesOrMore(2))
+ for (User *U : MulVal->users()) {
+ if (U == &I)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ // Check if truncation ignores bits above MulWidth.
+ unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
+ if (TruncWidth > MulWidth)
+ return nullptr;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ // Check if AND ignores bits above MulWidth.
+ if (BO->getOpcode() != Instruction::And)
+ return nullptr;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
+ return nullptr;
+ }
+ } else {
+ // Other uses prohibit this transformation.
+ return nullptr;
+ }
+ }
+
+ // Recognize patterns
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp eq/neq mulval, zext trunc mulval
+ if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal))
+ if (Zext->hasOneUse()) {
+ Value *ZextArg = Zext->getOperand(0);
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg))
+ if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth)
+ break; // Recognized
+ }
+
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
+ ConstantInt *CI;
+ Value *ValToMask;
+ if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
+ if (ValToMask != MulVal)
+ return nullptr;
+ const APInt &CVal = CI->getValue() + 1;
+ if (CVal.isPowerOf2()) {
+ unsigned MaskWidth = CVal.logBase2();
+ if (MaskWidth == MulWidth)
+ break; // Recognized
+ }
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ugt mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp uge mulval, max+1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ule mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ult mulval, max + 1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ default:
+ return nullptr;
+ }
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+ Builder.SetInsertPoint(MulInstr);
+
+ // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
+ Value *MulA = A, *MulB = B;
+ if (WidthA < MulWidth)
+ MulA = Builder.CreateZExt(A, MulType);
+ if (WidthB < MulWidth)
+ MulB = Builder.CreateZExt(B, MulType);
+ Value *F = Intrinsic::getDeclaration(I.getModule(),
+ Intrinsic::umul_with_overflow, MulType);
+ CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
+ IC.Worklist.Add(MulInstr);
+
+ // If there are uses of the mul result other than the comparison, we know
+ // that they are truncations or binary ANDs. Change them to use the result
+ // of mul.with.overflow and adjust the mask/size accordingly.
+ if (MulVal->hasNUsesOrMore(2)) {
+ Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
+ for (User *U : MulVal->users()) {
+ if (U == &I || U == OtherVal)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
+ IC.replaceInstUsesWith(*TI, Mul);
+ else
+ TI->setOperand(0, Mul);
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ assert(BO->getOpcode() == Instruction::And);
+ // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
+ Value *ShortMask =
+ Builder.CreateTrunc(BO->getOperand(1), Builder.getIntNTy(MulWidth));
+ Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
+ Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
+ if (auto *ZextI = dyn_cast<Instruction>(Zext))
+ IC.Worklist.Add(ZextI);
+ IC.replaceInstUsesWith(*BO, Zext);
+ } else {
+ llvm_unreachable("Unexpected Binary operation");
+ }
+ if (auto *UI = dyn_cast<Instruction>(U))
+ IC.Worklist.Add(UI);
+ }
+ }
+ if (isa<Instruction>(OtherVal))
+ IC.Worklist.Add(cast<Instruction>(OtherVal));
+
+ // The original icmp gets replaced with the overflow value, maybe inverted
+ // depending on predicate.
+ bool Inverse = false;
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_NE:
+ break;
+ case ICmpInst::ICMP_EQ:
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE:
+ if (I.getOperand(0) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_ULE:
+ if (I.getOperand(1) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ default:
+ llvm_unreachable("Unexpected predicate");
+ }
+ if (Inverse) {
+ Value *Res = Builder.CreateExtractValue(Call, 1);
+ return BinaryOperator::CreateNot(Res);
+ }
+
+ return ExtractValueInst::Create(Call, 1);
+}
+
+/// When performing a comparison against a constant, it is possible that not all
+/// the bits in the LHS are demanded. This helper method computes the mask that
+/// IS demanded.
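+ ///
+ /// For example, with illustrative 8-bit values: for "icmp ugt %x, 3" the two
+ /// trailing ones of the constant do not matter, so the demanded mask is
+ /// 0b11111100; for "icmp ult %x, 8" the three trailing zeros do not matter,
+ /// so the demanded mask is 0b11111000.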
+static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth,
+ bool isSignCheck) {
+ if (isSignCheck)
+ return APInt::getSignMask(BitWidth);
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!CI) return APInt::getAllOnesValue(BitWidth);
+ const APInt &RHS = CI->getValue();
+
+ switch (I.getPredicate()) {
+ // For a UGT comparison, we don't care about any bits that
+ // correspond to the trailing ones of the comparand. The value of these
+ // bits doesn't impact the outcome of the comparison, because any value
+ // greater than the RHS must differ in a bit higher than these due to carry.
+ case ICmpInst::ICMP_UGT: {
+ unsigned trailingOnes = RHS.countTrailingOnes();
+ return APInt::getBitsSetFrom(BitWidth, trailingOnes);
+ }
+
+ // Similarly, for a ULT comparison, we don't care about the trailing zeros.
+ // Any value less than the RHS must differ in a higher bit because of carries.
+ case ICmpInst::ICMP_ULT: {
+ unsigned trailingZeros = RHS.countTrailingZeros();
+ return APInt::getBitsSetFrom(BitWidth, trailingZeros);
+ }
+
+ default:
+ return APInt::getAllOnesValue(BitWidth);
+ }
+}
+
+ /// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
+ /// should be swapped.
+ /// The decision is based on how many times these two operands are reused
+ /// as subtract operands and their positions in those instructions.
+ /// The rationale is that several architectures use the same instruction for
+ /// both subtract and cmp, so it is better if the order of those operands
+ /// matches.
+/// \return true if Op0 and Op1 should be swapped.
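+ ///
+ /// For example, given the hypothetical IR
+ ///   %d = sub i32 %b, %a
+ ///   %c = icmp slt i32 %a, %b
+ /// swapping the compare to "icmp sgt i32 %b, %a" matches the operand order
+ /// of the subtract, so a target that implements cmp as a subtract can reuse
+ /// it.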
+static bool swapMayExposeCSEOpportunities(const Value * Op0,
+ const Value * Op1) {
+ // Filter out pointer values, as those cannot appear directly in a subtract.
+ // FIXME: we may want to go through inttoptrs or bitcasts.
+ if (Op0->getType()->isPointerTy())
+ return false;
+ // Count every use of both Op0 and Op1 in a subtract.
+ // Each time Op0 is the first operand, count -1: swapping is bad, the
+ // subtract already has the same layout as the compare.
+ // Each time Op0 is the second operand, count +1: swapping is good, the
+ // subtract has a different layout than the compare.
+ // At the end, if the benefit is greater than 0, Op0 should come second to
+ // expose more CSE opportunities.
+ int GlobalSwapBenefits = 0;
+ for (const User *U : Op0->users()) {
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(U);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
+ continue;
+ // If Op0 is the first argument, it is not beneficial to swap the
+ // arguments.
+ int LocalSwapBenefits = -1;
+ unsigned Op1Idx = 1;
+ if (BinOp->getOperand(Op1Idx) == Op0) {
+ Op1Idx = 0;
+ LocalSwapBenefits = 1;
+ }
+ if (BinOp->getOperand(Op1Idx) != Op1)
+ continue;
+ GlobalSwapBenefits += LocalSwapBenefits;
+ }
+ return GlobalSwapBenefits > 0;
+}
+
+/// \brief Check that one use is in the same block as the definition and all
+/// other uses are in blocks dominated by a given block.
+///
+/// \param DI Definition
+/// \param UI Use
+/// \param DB Block that must dominate all uses of \p DI outside
+/// the parent block
+/// \return true when \p UI is the only use of \p DI in the parent block
+/// and all other uses of \p DI are in blocks dominated by \p DB.
+///
+bool InstCombiner::dominatesAllUses(const Instruction *DI,
+ const Instruction *UI,
+ const BasicBlock *DB) const {
+ assert(DI && UI && "Instruction not defined\n");
+ // Ignore incomplete definitions.
+ if (!DI->getParent())
+ return false;
+ // DI and UI must be in the same block.
+ if (DI->getParent() != UI->getParent())
+ return false;
+ // Protect from self-referencing blocks.
+ if (DI->getParent() == DB)
+ return false;
+ for (const User *U : DI->users()) {
+ auto *Usr = cast<Instruction>(U);
+ if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
+ return false;
+ }
+ return true;
+}
+
+/// Return true when the instruction sequence within a block is select-cmp-br.
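+ /// That is, a block of the form (illustrative IR):
+ ///   %sel = select i1 %cond, %C* %a, %C* null
+ ///   %cmp = icmp eq %C* %sel, null
+ ///   br i1 %cmp, label %bb1, label %bb2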
+static bool isChainSelectCmpBranch(const SelectInst *SI) {
+ const BasicBlock *BB = SI->getParent();
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
+ if (!BI || BI->getNumSuccessors() != 2)
+ return false;
+ auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
+ return false;
+ return true;
+}
+
+/// \brief True when a select result is replaced by one of its operands
+/// in select-icmp sequence. This will eventually result in the elimination
+/// of the select.
+///
+/// \param SI Select instruction
+/// \param Icmp Compare instruction
+/// \param SIOpd Operand that replaces the select
+///
+/// Notes:
+/// - The replacement is global and requires dominator information
+/// - The caller is responsible for the actual replacement
+///
+/// Example:
+///
+/// entry:
+/// %4 = select i1 %3, %C* %0, %C* null
+/// %5 = icmp eq %C* %4, null
+/// br i1 %5, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0
+/// ...
+///
+/// can be transformed to
+///
+/// %5 = icmp eq %C* %0, null
+/// %6 = select i1 %3, i1 %5, i1 true
+/// br i1 %6, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0!
+///
+ /// The same applies when the first operand of the select is a constant and/or
+ /// the compare is for not-equal rather than equal.
+///
+ /// NOTE: The function is only called when the select and compare constants
+ /// are equal, and the optimization can work only for EQ predicates. This is
+ /// not a major restriction since a NE compare should be 'normalized' to an
+ /// equal compare, which usually happens in the combiner and is checked by
+ /// the test case select-cmp-br.ll.
+bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
+ const ICmpInst *Icmp,
+ const unsigned SIOpd) {
+ assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
+ if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
+ BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
+ // The check for the single predecessor is not the best that can be
+ // done. But it protects efficiently against cases like when SI's
+ // home block has two successors, Succ and Succ1, and Succ1 is a
+ // predecessor of Succ. Then SI can't be replaced by SIOpd because the use
+ // that gets replaced can be reached on either path. So the uniqueness check
+ // guarantees that the path all uses of SI (outside SI's parent) are on
+ // is disjoint from all other paths out of SI. But that information
+ // is more expensive to compute, and the trade-off here is in favor
+ // of compile time. Note also that we check for a single predecessor
+ // and not just uniqueness, to handle the situation when Succ and
+ // Succ1 point to the same basic block.
+ if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+ NumSel++;
+ SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Try to fold the comparison based on range information we can get by checking
+/// whether bits are known to be zero or one in the inputs.
+Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = Op0->getType();
+ ICmpInst::Predicate Pred = I.getPredicate();
+
+ // Get scalar or pointer size.
+ unsigned BitWidth = Ty->isIntOrIntVectorTy()
+ ? Ty->getScalarSizeInBits()
+ : DL.getTypeSizeInBits(Ty->getScalarType());
+
+ if (!BitWidth)
+ return nullptr;
+
+ // If this is a normal comparison, it demands all bits. If it is a sign bit
+ // comparison, it only demands the sign bit.
+ bool IsSignBit = false;
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ bool UnusedBit;
+ IsSignBit = isSignBitCheck(Pred, *CmpC, UnusedBit);
+ }
+
+ KnownBits Op0Known(BitWidth);
+ KnownBits Op1Known(BitWidth);
+
+ if (SimplifyDemandedBits(&I, 0,
+ getDemandedBitsLHSMask(I, BitWidth, IsSignBit),
+ Op0Known, 0))
+ return &I;
+
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
+ Op1Known, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in. Compute the Min, Max and RHS values based on the known bits. For the
+ // EQ and NE we use unsigned values.
+ APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+ APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+ if (I.isSigned()) {
+ computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
+ computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
+ } else {
+ computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
+ computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
+ }
+
+ // If Min and Max are known to be the same, then SimplifyDemandedBits
+ // figured out that the LHS is a constant. Constant fold this now, so that
+ // code below can assume that Min != Max.
+ if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+ return new ICmpInst(Pred, ConstantInt::get(Op0->getType(), Op0Min), Op1);
+ if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+ return new ICmpInst(Pred, Op0, ConstantInt::get(Op1->getType(), Op1Min));
+
+ // Based on the range information we know about the LHS, see if we can
+ // simplify this comparison. For example, (x&4) < 8 is always true.
+ switch (Pred) {
+ default:
+ llvm_unreachable("Unknown icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE: {
+ if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) {
+ return Pred == CmpInst::ICMP_EQ
+ ? replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()))
+ : replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ }
+
+ // If all bits are known zero except for one, then we know at most one bit
+ // is set. If the comparison is against zero, then this is a check to see if
+ // *that* bit is set.
+ APInt Op0KnownZeroInverted = ~Op0Known.Zero;
+ if (Op1Known.isZero()) {
+ // If the LHS is an AND with the same constant, look through it.
+ Value *LHS = nullptr;
+ const APInt *LHSC;
+ if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
+ *LHSC != Op0KnownZeroInverted)
+ LHS = Op0;
+
+ Value *X;
+ if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+ APInt ValToCheck = Op0KnownZeroInverted;
+ Type *XTy = X->getType();
+ if (ValToCheck.isPowerOf2()) {
+ // ((1 << X) & 8) == 0 -> X != 3
+ // ((1 << X) & 8) != 0 -> X == 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, CmpC);
+ } else if ((++ValToCheck).isPowerOf2()) {
+ // ((1 << X) & 7) == 0 -> X >= 3
+ // ((1 << X) & 7) != 0 -> X < 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, CmpC);
+ }
+ }
+
+ // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
+ const APInt *CI;
+ if (Op0KnownZeroInverted.isOneValue() &&
+ match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
+ // ((8 >>u X) & 1) == 0 -> X != 3
+ // ((8 >>u X) & 1) != 0 -> X == 3
+ unsigned CmpVal = CI->countTrailingZeros();
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_ULT: {
+ if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A <u C -> A == C-1 if min(A)+1 == C
+ if (Op1Max == Op0Min + 1) {
+ Constant *CMinus1 = ConstantInt::get(Op0->getType(), *CmpC - 1);
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, CMinus1);
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+
+ if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A >u C -> A == C+1 if max(A)-1 == C
+ if (*CmpC == Op0Max - 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SLT:
+ if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Max == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Builder.getInt(CI->getValue() - 1));
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Min == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Builder.getInt(CI->getValue() + 1));
+ }
+ break;
+ case ICmpInst::ICMP_SGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+ if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_SLE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+ if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_UGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+ if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ case ICmpInst::ICMP_ULE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+ if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ break;
+ }
+
+ // Turn a signed comparison into an unsigned one if both operands are known to
+ // have the same sign.
+ if (I.isSigned() &&
+ ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
+ (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
+ return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+
+ return nullptr;
+}
+
+/// If we have an icmp le or icmp ge instruction with a constant operand, turn
+/// it into the appropriate icmp lt or icmp gt instruction. This transform
+/// allows them to be folded in visitICmpInst.
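+ /// For example, with illustrative constants:
+ ///   icmp sle %x, 5  -->  icmp slt %x, 6
+ ///   icmp uge %x, 4  -->  icmp ugt %x, 3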
+static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
+ ICmpInst::Predicate Pred = I.getPredicate();
+ if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE)
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (!Op1C)
+ return nullptr;
+
+ // Check if the constant operand can be safely incremented/decremented without
+ // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled
+ // the edge cases for us, so we just assert on them. For vectors, we must
+ // handle the edge cases.
+ Type *Op1Type = Op1->getType();
+ bool IsSigned = I.isSigned();
+ bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE);
+ auto *CI = dyn_cast<ConstantInt>(Op1C);
+ if (CI) {
+ // A <= MAX -> TRUE ; A >= MIN -> TRUE
+ assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned));
+ } else if (Op1Type->isVectorTy()) {
+ // TODO? If the edge cases for vectors were guaranteed to be handled as they
+ // are for scalar, we could remove the min/max checks. However, to do that,
+ // we would have to use insertelement/shufflevector to replace edge values.
+ unsigned NumElts = Op1Type->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = Op1C->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ // Bail out if we can't determine if this constant is min/max or if we
+ // know that this constant is min/max.
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || (IsLE ? CI->isMaxValue(IsSigned) : CI->isMinValue(IsSigned)))
+ return nullptr;
+ }
+ } else {
+ // ConstantExpr?
+ return nullptr;
+ }
+
+ // Increment or decrement the constant and set the new comparison predicate:
+ // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT
+ Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true);
+ CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT: ICmpInst::ICMP_UGT;
+ NewPred = IsSigned ? ICmpInst::getSignedPredicate(NewPred) : NewPred;
+ return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
+}
+
+/// Integer compare with boolean values can always be turned into bitwise ops.
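+ /// For example, for i1 operands %a and %b:
+ ///   icmp eq  i1 %a, %b  -->  ~(%a ^ %b)
+ ///   icmp ult i1 %a, %b  -->  ~%a & %b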
+static Instruction *canonicalizeICmpBool(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *A = I.getOperand(0), *B = I.getOperand(1);
+ assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
+
+ // A boolean compared to true/false can be simplified to Op0/true/false in
+ // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
+ // Cases not handled by InstSimplify are always 'not' of Op0.
+ if (match(B, m_Zero())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_EQ: // A == 0 -> !A
+ case CmpInst::ICMP_ULE: // A <=u 0 -> !A
+ case CmpInst::ICMP_SGE: // A >=s 0 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ } else if (match(B, m_One())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_NE: // A != 1 -> !A
+ case CmpInst::ICMP_ULT: // A <u 1 -> !A
+ case CmpInst::ICMP_SGT: // A >s -1 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ }
+
+ switch (I.getPredicate()) {
+ default:
+ llvm_unreachable("Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ:
+ // icmp eq i1 A, B -> ~(A ^ B)
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ case ICmpInst::ICMP_NE:
+ // icmp ne i1 A, B -> A ^ B
+ return BinaryOperator::CreateXor(A, B);
+
+ case ICmpInst::ICMP_UGT:
+ // icmp ugt -> icmp ult
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULT:
+ // icmp ult i1 A, B -> ~A & B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGT:
+ // icmp sgt -> icmp slt
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLT:
+ // icmp slt i1 A, B -> A & ~B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
+
+ case ICmpInst::ICMP_UGE:
+ // icmp uge -> icmp ule
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULE:
+ // icmp ule i1 A, B -> ~A | B
+ return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGE:
+ // icmp sge -> icmp sle
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLE:
+ // icmp sle i1 A, B -> A | ~B
+ return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
+ }
+}
+
+Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
+ bool Changed = false;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ unsigned Op0Cplxity = getComplexity(Op0);
+ unsigned Op1Cplxity = getComplexity(Op1);
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. Binary operators count as more complex than
+ /// unary operators, which count as more complex than constants, so
+ /// constants end up on the RHS.
+ if (Op0Cplxity < Op1Cplxity ||
+ (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
+ I.swapOperands();
+ std::swap(Op0, Op1);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1,
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Comparing -val or val for inequality with zero is the same as just
+ // comparing val, i.e. abs(val) != 0 -> val != 0.
+ if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
+ Value *Cond, *SelectTrue, *SelectFalse;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
+ m_Value(SelectFalse)))) {
+ if (Value *V = dyn_castNegVal(SelectTrue)) {
+ if (V == SelectFalse)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ else if (Value *V = dyn_castNegVal(SelectFalse)) {
+ if (V == SelectTrue)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ }
+ }
+
+ if (Op0->getType()->isIntOrIntVectorTy(1))
+ if (Instruction *Res = canonicalizeICmpBool(I, Builder))
+ return Res;
+
+ if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
+ return NewICmp;
+
+ if (Instruction *Res = foldICmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpUsingKnownBits(I))
+ return Res;
+
+ // Test if the ICmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(*I.user_begin()))
+ if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+ (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+ return nullptr;
+
+ // FIXME: We only do this after checking for min/max to prevent infinite
+ // looping caused by a reverse canonicalization of these patterns for min/max.
+ // FIXME: The organization of folds is a mess. These would naturally go into
+ // canonicalizeCmpWithConstant(), but we can't move all of the above folds
+ // down here after the min/max restriction.
+ ICmpInst::Predicate Pred = I.getPredicate();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
+ if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
+ Constant *Zero = Constant::getNullValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
+ }
+
+ // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
+ if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
+ Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
+ }
+ }
+
+ if (Instruction *Res = foldICmpInstWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
+ return Res;
+
+  // If we can optimize an 'icmp GEP, P' or 'icmp P, GEP', do so now.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
+ if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
+ return NI;
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
+ if (Instruction *NI = foldGEPICmp(GEP, Op0,
+ ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+ return NI;
+
+ // Try to optimize equality comparisons against alloca-based pointers.
+ if (Op0->getType()->isPointerTy() && I.isEquality()) {
+ assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
+ if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL)))
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
+ return New;
+ if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL)))
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
+ return New;
+ }
+
+ // Test to see if the operands of the icmp are casted versions of other
+ // values. If the ptr->ptr cast can be stripped off both arguments, we do so
+ // now.
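+  // A sketch (types and value names are illustrative):
+  //   icmp eq (bitcast %T* %p to i8*), (bitcast %T* @g to i8*)
+  //     --> icmp eq %T* %p, @g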
+ if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
+ if (Op0->getType()->isPointerTy() &&
+ (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+ // We keep moving the cast from the left operand over to the right
+ // operand, where it can often be eliminated completely.
+ Op0 = CI->getOperand(0);
+
+ // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+ // so eliminate it as well.
+ if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
+ Op1 = CI2->getOperand(0);
+
+ // If Op1 is a constant, we can fold the cast into the constant.
+ if (Op0->getType() != Op1->getType()) {
+ if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
+ } else {
+ // Otherwise, cast the RHS right before the icmp
+ Op1 = Builder.CreateBitCast(Op1, Op0->getType());
+ }
+ }
+ return new ICmpInst(I.getPredicate(), Op0, Op1);
+ }
+ }
+
+ if (isa<CastInst>(Op0)) {
+ // Handle the special case of: icmp (cast bool to X), <cst>
+ // This comes up when you have code like
+ // int X = A < B;
+ // if (X) ...
+ // For generality, we handle any zero-extension of any operand comparison
+ // with a constant or another cast from the same type.
+ if (isa<Constant>(Op1) || isa<CastInst>(Op1))
+ if (Instruction *R = foldICmpWithCastAndCast(I))
+ return R;
+ }
+
+ if (Instruction *Res = foldICmpBinOp(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpWithMinMax(I))
+ return Res;
+
+ {
+ Value *A, *B;
+ // Transform (A & ~B) == 0 --> (A & B) != 0
+ // and (A & ~B) != 0 --> (A & B) == 0
+ // if A is a power of 2.
+ if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Zero()) &&
+ isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
+ return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
+ Op1);
+
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
+ if (match(Op0, m_Not(m_Value(A)))) {
+ if (match(Op1, m_Not(m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B, A);
+
+ const APInt *C;
+ if (match(Op1, m_APInt(C)))
+ return new ICmpInst(I.getSwappedPredicate(), A,
+ ConstantInt::get(Op1->getType(), ~(*C)));
+ }
+
+ Instruction *AddI = nullptr;
+ if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
+ m_Instruction(AddI))) &&
+ isa<IntegerType>(A->getType())) {
+ Value *Result;
+ Constant *Overflow;
+ if (OptimizeOverflowCheck(OCF_UNSIGNED_ADD, A, B, *AddI, Result,
+ Overflow)) {
+ replaceInstUsesWith(*AddI, Result);
+ return replaceInstUsesWith(I, Overflow);
+ }
+ }
+
+ // (zext a) * (zext b) --> llvm.umul.with.overflow.
+ if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
+ return R;
+ }
+ if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
+ return R;
+ }
+ }
+
+ if (Instruction *Res = foldICmpEquality(I))
+ return Res;
+
+ // The 'cmpxchg' instruction returns an aggregate containing the old value and
+ // an i1 which indicates whether or not we successfully did the swap.
+ //
+ // Replace comparisons between the old value and the expected value with the
+ // indicator that 'cmpxchg' returns.
+ //
+ // N.B. This transform is only valid when the 'cmpxchg' is not permitted to
+ // spuriously fail. In those cases, the old value may equal the expected
+ // value but it is possible for the swap to not occur.
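+  //
+  // A sketch of the pattern in IR (value names are illustrative only):
+  //   %pair = cmpxchg i32* %p, i32 %expected, i32 %new seq_cst seq_cst
+  //   %old  = extractvalue { i32, i1 } %pair, 0
+  //   %cmp  = icmp eq i32 %old, %expected
+  // becomes
+  //   %cmp  = extractvalue { i32, i1 } %pair, 1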
+ if (I.getPredicate() == ICmpInst::ICMP_EQ)
+ if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
+ if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
+ if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
+ !ACXI->isWeak())
+ return ExtractValueInst::Create(ACXI, 1);
+
+ {
+ Value *X; ConstantInt *Cst;
+ // icmp X+Cst, X
+ if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
+ return foldICmpAddOpConst(I, X, Cst, I.getPredicate());
+
+ // icmp X, X+Cst
+ if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
+ return foldICmpAddOpConst(I, X, Cst, I.getSwappedPredicate());
+ }
+ return Changed ? &I : nullptr;
+}
+
+/// Fold fcmp ([us]itofp x, cst) if possible.
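+///
+/// For example, 'fcmp oeq (sitofp i32 %x to float), 4.4' can fold to 'false'
+/// because no i32 converts exactly to 4.4; the code below also guards against
+/// precision loss when the integer is wider than the FP mantissa. (This is a
+/// sketch of the idea, not an exhaustive list of handled cases.)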
+Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC) {
+ if (!isa<ConstantFP>(RHSC)) return nullptr;
+ const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+
+ // Get the width of the mantissa. We don't want to hack on conversions that
+ // might lose information from the integer, e.g. "i64 -> float"
+ int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+ if (MantissaWidth == -1) return nullptr; // Unknown.
+
+ IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
+ bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+
+ if (I.isEquality()) {
+ FCmpInst::Predicate P = I.getPredicate();
+ bool IsExact = false;
+ APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
+ RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
+
+    // If the floating point constant isn't an integer value, we already know
+    // whether we will ever compare equal / not equal to it.
+ if (!IsExact) {
+ // TODO: Can never be -0.0 and other non-representable values
+ APFloat RHSRoundInt(RHS);
+ RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
+ if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
+ return replaceInstUsesWith(I, Builder.getFalse());
+
+ assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
+ return replaceInstUsesWith(I, Builder.getTrue());
+ }
+ }
+
+ // TODO: If the constant is exactly representable, is it always OK to do
+ // equality compares as integer?
+ }
+
+  // Check to see that the input is converted from an integer type small enough
+  // to preserve all bits. TODO: check here for "known" sign bits. This would
+  // allow us to handle (fptosi (x >>s 62) to float) if x is i64, for example.
+ unsigned InputSize = IntTy->getScalarSizeInBits();
+
+  // The following test does NOT adjust InputSize downwards for signed inputs,
+ // because the most negative value still requires all the mantissa bits
+ // to distinguish it from one less than that value.
+ if ((int)InputSize > MantissaWidth) {
+ // Conversion would lose accuracy. Check if loss can impact comparison.
+ int Exp = ilogb(RHS);
+ if (Exp == APFloat::IEK_Inf) {
+ int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
+ if (MaxExponent < (int)InputSize - !LHSUnsigned)
+ // Conversion could create infinity.
+ return nullptr;
+ } else {
+      // Note that if RHS is zero or NaN, then Exp is negative
+      // and the first condition is trivially false.
+ if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
+ // Conversion could affect comparison.
+ return nullptr;
+ }
+ }
+
+ // Otherwise, we can potentially simplify the comparison. We know that it
+ // will always come through as an integer value and we know the constant is
+ // not a NAN (it would have been previously simplified).
+ assert(!RHS.isNaN() && "NaN comparison not already folded!");
+
+ ICmpInst::Predicate Pred;
+ switch (I.getPredicate()) {
+ default: llvm_unreachable("Unexpected predicate!");
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_OEQ:
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_OGT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+ break;
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OGE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+ break;
+ case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_OLT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+ break;
+ case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_OLE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+ break;
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ONE:
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case FCmpInst::FCMP_ORD:
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case FCmpInst::FCMP_UNO:
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+
+ // Now we know that the APFloat is a normal number, zero or inf.
+
+ // See if the FP constant is too large for the integer. For example,
+ // comparing an i8 to 300.0.
+ unsigned IntWidth = IntTy->getScalarSizeInBits();
+
+ if (!LHSUnsigned) {
+ // If the RHS value is > SignedMax, fold the comparison. This handles +INF
+ // and large values.
+ APFloat SMax(RHS.getSemantics());
+ SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_SLE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // If the RHS value is > UnsignedMax, fold the comparison. This handles
+ // +INF and large values.
+ APFloat UMax(RHS.getSemantics());
+ UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
+ Pred == ICmpInst::ICMP_ULE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ if (!LHSUnsigned) {
+ // See if the RHS value is < SignedMin.
+ APFloat SMin(RHS.getSemantics());
+ SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+ Pred == ICmpInst::ICMP_SGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // See if the RHS value is < UnsignedMin.
+ APFloat SMin(RHS.getSemantics());
+ SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
+ Pred == ICmpInst::ICMP_UGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+ // [0, UMAX], but it may still be fractional. See if it is fractional by
+ // casting the FP value to the integer value and back, checking for equality.
+ // Don't do this for zero, because -0.0 is not fractional.
+ Constant *RHSInt = LHSUnsigned
+ ? ConstantExpr::getFPToUI(RHSC, IntTy)
+ : ConstantExpr::getFPToSI(RHSC, IntTy);
+ if (!RHS.isZero()) {
+ bool Equal = LHSUnsigned
+ ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+ : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+ if (!Equal) {
+ // If we had a comparison against a fractional value, we have to adjust
+ // the compare predicate and sometimes the value. RHSC is rounded towards
+ // zero at this point.
+ switch (Pred) {
+ default: llvm_unreachable("Unexpected integer comparison!");
+ case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
+ return replaceInstUsesWith(I, Builder.getFalse());
+ case ICmpInst::ICMP_ULE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> false
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ break;
+ case ICmpInst::ICMP_SLE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> int < -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLT;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // (float)int < -4.4 --> false
+ // (float)int < 4.4 --> int <= 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ Pred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_SLT:
+ // (float)int < -4.4 --> int < -4
+ // (float)int < 4.4 --> int <= 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> true
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> int >= -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ // (float)int >= -4.4 --> true
+ // (float)int >= 4.4 --> int > 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ Pred = ICmpInst::ICMP_UGT;
+ break;
+ case ICmpInst::ICMP_SGE:
+ // (float)int >= -4.4 --> int >= -4
+ // (float)int >= 4.4 --> int > 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGT;
+ break;
+ }
+ }
+ }
+
+ // Lower this FP comparison into an appropriate integer version of the
+ // comparison.
+ return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
+
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+ bool Changed = false;
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+ /// before binary operators.
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+ I.swapOperands();
+ Changed = true;
+ }
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V =
+ SimplifyFCmpInst(I.getPredicate(), Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Simplify 'fcmp pred X, X'
+ if (Op0 == Op1) {
+ switch (I.getPredicate()) {
+ default: llvm_unreachable("Unknown predicate!");
+ case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
+ case FCmpInst::FCMP_ULT: // True if unordered or less than
+ case FCmpInst::FCMP_UGT: // True if unordered or greater than
+ case FCmpInst::FCMP_UNE: // True if unordered or not equal
+ // Canonicalize these to be 'fcmp uno %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_UNO);
+ I.setOperand(1, Constant::getNullValue(Op0->getType()));
+ return &I;
+
+ case FCmpInst::FCMP_ORD: // True if ordered (no nans)
+ case FCmpInst::FCMP_OEQ: // True if ordered and equal
+ case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
+ case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
+ // Canonicalize these to be 'fcmp ord %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_ORD);
+ I.setOperand(1, Constant::getNullValue(Op0->getType()));
+ return &I;
+ }
+ }
+
+ // Test if the FCmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(*I.user_begin()))
+ if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+ (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+ return nullptr;
+
+ // Handle fcmp with constant RHS
+ if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+ if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+ switch (LHSI->getOpcode()) {
+ case Instruction::FPExt: {
+ // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
+ FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
+ ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
+ if (!RHSF)
+ break;
+
+ const fltSemantics *Sem;
+ // FIXME: This shouldn't be here.
+ if (LHSExt->getSrcTy()->isHalfTy())
+ Sem = &APFloat::IEEEhalf();
+ else if (LHSExt->getSrcTy()->isFloatTy())
+ Sem = &APFloat::IEEEsingle();
+ else if (LHSExt->getSrcTy()->isDoubleTy())
+ Sem = &APFloat::IEEEdouble();
+ else if (LHSExt->getSrcTy()->isFP128Ty())
+ Sem = &APFloat::IEEEquad();
+ else if (LHSExt->getSrcTy()->isX86_FP80Ty())
+ Sem = &APFloat::x87DoubleExtended();
+ else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
+ Sem = &APFloat::PPCDoubleDouble();
+ else
+ break;
+
+ bool Lossy;
+ APFloat F = RHSF->getValueAPF();
+ F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy);
+
+ // Avoid lossy conversions and denormals. Zero is a special case
+ // that's OK to convert.
+ APFloat Fabs = F;
+ Fabs.clearSign();
+ if (!Lossy &&
+ ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) !=
+ APFloat::cmpLessThan) || Fabs.isZero()))
+ return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0),
+ ConstantFP::get(RHSC->getContext(), F));
+ break;
+ }
+ case Instruction::PHI:
+ // Only fold fcmp into the PHI if the phi and fcmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::FSub: {
+ // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
+ Value *Op;
+ if (match(LHSI, m_FNeg(m_Value(Op))))
+ return new FCmpInst(I.getSwappedPredicate(), Op,
+ ConstantExpr::getFNeg(RHSC));
+ break;
+ }
+ case Instruction::Load:
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ }
+ break;
+ case Instruction::Call: {
+ if (!RHSC->isNullValue())
+ break;
+
+ CallInst *CI = cast<CallInst>(LHSI);
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
+ if (IID != Intrinsic::fabs)
+ break;
+
+      // Various optimizations for fabs compared with zero.
+ switch (I.getPredicate()) {
+ default:
+ break;
+ // fabs(x) < 0 --> false
+ case FCmpInst::FCMP_OLT:
+ llvm_unreachable("handled by SimplifyFCmpInst");
+ // fabs(x) > 0 --> x != 0
+ case FCmpInst::FCMP_OGT:
+ return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
+ // fabs(x) <= 0 --> x == 0
+ case FCmpInst::FCMP_OLE:
+ return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
+ // fabs(x) >= 0 --> !isnan(x)
+ case FCmpInst::FCMP_OGE:
+ return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
+ // fabs(x) == 0 --> x == 0
+ // fabs(x) != 0 --> x != 0
+ case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_UNE:
+ return new FCmpInst(I.getPredicate(), CI->getArgOperand(0), RHSC);
+ }
+ }
+ }
+ }
+
+ // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return new FCmpInst(I.getSwappedPredicate(), X, Y);
+
+ // fcmp (fpext x), (fpext y) -> fcmp x, y
+ if (FPExtInst *LHSExt = dyn_cast<FPExtInst>(Op0))
+ if (FPExtInst *RHSExt = dyn_cast<FPExtInst>(Op1))
+ if (LHSExt->getSrcTy() == RHSExt->getSrcTy())
+ return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0),
+ RHSExt->getOperand(0));
+
+ return Changed ? &I : nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
new file mode 100644
index 000000000000..c38a4981bf1d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -0,0 +1,760 @@
+//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides internal interfaces used to implement InstCombine.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "instcombine"
+
+namespace llvm {
+class CallSite;
+class DataLayout;
+class DominatorTree;
+class TargetLibraryInfo;
+class DbgDeclareInst;
+class MemIntrinsic;
+class MemSetInst;
+
+/// Assign a complexity or rank value to LLVM Values. This is used to reduce
+/// the amount of pattern matching needed for compares and commutative
+/// instructions. For example, if we have:
+/// icmp ugt X, Constant
+/// or
+/// xor (add X, Constant), cast Z
+///
+/// We do not have to consider the commuted variants of these patterns because
+/// canonicalization based on complexity guarantees the above ordering.
+///
+/// This routine maps IR values to various complexity ranks:
+/// 0 -> undef
+/// 1 -> Constants
+/// 2 -> Other non-instructions
+/// 3 -> Arguments
+/// 4 -> Cast and (f)neg/not instructions
+/// 5 -> Other instructions
+static inline unsigned getComplexity(Value *V) {
+ if (isa<Instruction>(V)) {
+ if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
+ BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+ return 4;
+ return 5;
+ }
+ if (isa<Argument>(V))
+ return 3;
+ return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
+}
+
+/// Predicate canonicalization reduces the number of patterns that need to be
+/// matched by other transforms. For example, we may swap the operands of a
+/// conditional branch or select to create a compare with a canonical (inverted)
+/// predicate which is then more likely to be matched with other values.
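+///
+/// For example, rather than matching 'icmp sle X, Y' directly, a transform can
+/// produce 'icmp sgt X, Y' and swap the arms of the select or branch that uses
+/// it, so later folds only need to recognize the canonical predicate.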
+static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ case CmpInst::ICMP_NE:
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ // TODO: There are 16 FCMP predicates. Should others be (not) canonical?
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OGE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+/// Return the source operand of a potentially bitcasted value while optionally
+/// checking if it has one use. If there is no bitcast or the one use check is
+/// not met, return the input value itself.
+static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
+ if (auto *BitCast = dyn_cast<BitCastInst>(V))
+ if (!OneUseOnly || BitCast->hasOneUse())
+ return BitCast->getOperand(0);
+
+ // V is not a bitcast or V has more than one use and OneUseOnly is true.
+ return V;
+}
+
+/// \brief Add one to a Constant
+static inline Constant *AddOne(Constant *C) {
+ return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
+}
+/// \brief Subtract one from a Constant
+static inline Constant *SubOne(Constant *C) {
+ return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
+}
+
+/// \brief Return true if the specified value is free to invert (apply ~ to).
+/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses
+/// is true, work under the assumption that the caller intends to remove all
+/// uses of V and only keep uses of ~V.
+///
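+/// For example, a compare is free to invert when the caller will replace all
+/// of its uses, because the predicate can simply be inverted to produce ~V;
+/// likewise, ~C for a constant integer C is just another constant.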
+static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
+ // ~(~(X)) -> X.
+ if (BinaryOperator::isNot(V))
+ return true;
+
+ // Constants can be considered to be not'ed values.
+ if (isa<ConstantInt>(V))
+ return true;
+
+ // A vector of constant integers can be inverted easily.
+ if (V->getType()->isVectorTy() && isa<Constant>(V)) {
+ unsigned NumElts = V->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
+ if (!Elt)
+ return false;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return false;
+ }
+ return true;
+ }
+
+ // Compares can be inverted if all of their uses are being modified to use the
+ // ~V.
+ if (isa<CmpInst>(V))
+ return WillInvertAllUses;
+
+ // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1
+ // - Constant) - A` if we are willing to invert all of the uses.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub)
+ if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1)))
+ return WillInvertAllUses;
+
+ return false;
+}
+
+/// \brief Specific patterns of overflow check idioms that we match.
+enum OverflowCheckFlavor {
+ OCF_UNSIGNED_ADD,
+ OCF_SIGNED_ADD,
+ OCF_UNSIGNED_SUB,
+ OCF_SIGNED_SUB,
+ OCF_UNSIGNED_MUL,
+ OCF_SIGNED_MUL,
+
+ OCF_INVALID
+};
+
+/// \brief Returns the OverflowCheckFlavor corresponding to an overflow_with_op
+/// intrinsic.
+static inline OverflowCheckFlavor
+IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
+ switch (ID) {
+ default:
+ return OCF_INVALID;
+ case Intrinsic::uadd_with_overflow:
+ return OCF_UNSIGNED_ADD;
+ case Intrinsic::sadd_with_overflow:
+ return OCF_SIGNED_ADD;
+ case Intrinsic::usub_with_overflow:
+ return OCF_UNSIGNED_SUB;
+ case Intrinsic::ssub_with_overflow:
+ return OCF_SIGNED_SUB;
+ case Intrinsic::umul_with_overflow:
+ return OCF_UNSIGNED_MUL;
+ case Intrinsic::smul_with_overflow:
+ return OCF_SIGNED_MUL;
+ }
+}
+
+/// \brief The core instruction combiner logic.
+///
+/// This class provides both the logic to recursively visit instructions and
+/// combine them.
+class LLVM_LIBRARY_VISIBILITY InstCombiner
+ : public InstVisitor<InstCombiner, Instruction *> {
+ // FIXME: These members shouldn't be public.
+public:
+ /// \brief A worklist of the instructions that need to be simplified.
+ InstCombineWorklist &Worklist;
+
+ /// \brief An IRBuilder that automatically inserts new instructions into the
+ /// worklist.
+ typedef IRBuilder<TargetFolder, IRBuilderCallbackInserter> BuilderTy;
+ BuilderTy &Builder;
+
+private:
+ // Mode in which we are running the combiner.
+ const bool MinimizeSize;
+ /// Enable combines that trigger rarely but are costly in compiletime.
+ const bool ExpensiveCombines;
+
+ AliasAnalysis *AA;
+
+ // Required analyses.
+ AssumptionCache &AC;
+ TargetLibraryInfo &TLI;
+ DominatorTree &DT;
+ const DataLayout &DL;
+ const SimplifyQuery SQ;
+  // Optional analyses. When non-null, these can be used to do better combining
+  // and will be updated to reflect any changes.
+ LoopInfo *LI;
+
+ bool MadeIRChange;
+
+public:
+ InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
+ bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
+ AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
+ const DataLayout &DL, LoopInfo *LI)
+ : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
+ ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
+ DL(DL), SQ(DL, &TLI, &DT, &AC), LI(LI), MadeIRChange(false) {}
+
+ /// \brief Run the combiner over the entire worklist until it is empty.
+ ///
+ /// \returns true if the IR is changed.
+ bool run();
+
+ AssumptionCache &getAssumptionCache() const { return AC; }
+
+ const DataLayout &getDataLayout() const { return DL; }
+
+ DominatorTree &getDominatorTree() const { return DT; }
+
+ LoopInfo *getLoopInfo() const { return LI; }
+
+ TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; }
+
+ // Visitation implementation - Implement instruction combining for different
+ // instruction types. The semantics are as follows:
+ // Return Value:
+ // null - No change was made
+ // I - Change was made, I is still valid, I may be dead though
+ // otherwise - Change was made, replace I with returned instruction
+ //
+ Instruction *visitAdd(BinaryOperator &I);
+ Instruction *visitFAdd(BinaryOperator &I);
+ Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
+ Instruction *visitSub(BinaryOperator &I);
+ Instruction *visitFSub(BinaryOperator &I);
+ Instruction *visitMul(BinaryOperator &I);
+ Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C,
+ Instruction *InsertBefore);
+ Instruction *visitFMul(BinaryOperator &I);
+ Instruction *visitURem(BinaryOperator &I);
+ Instruction *visitSRem(BinaryOperator &I);
+ Instruction *visitFRem(BinaryOperator &I);
+ bool SimplifyDivRemOfSelect(BinaryOperator &I);
+ Instruction *commonRemTransforms(BinaryOperator &I);
+ Instruction *commonIRemTransforms(BinaryOperator &I);
+ Instruction *commonDivTransforms(BinaryOperator &I);
+ Instruction *commonIDivTransforms(BinaryOperator &I);
+ Instruction *visitUDiv(BinaryOperator &I);
+ Instruction *visitSDiv(BinaryOperator &I);
+ Instruction *visitFDiv(BinaryOperator &I);
+ Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
+ Instruction *visitAnd(BinaryOperator &I);
+ Instruction *visitOr(BinaryOperator &I);
+ Instruction *visitXor(BinaryOperator &I);
+ Instruction *visitShl(BinaryOperator &I);
+ Instruction *visitAShr(BinaryOperator &I);
+ Instruction *visitLShr(BinaryOperator &I);
+ Instruction *commonShiftTransforms(BinaryOperator &I);
+ Instruction *visitFCmpInst(FCmpInst &I);
+ Instruction *visitICmpInst(ICmpInst &I);
+ Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
+ BinaryOperator &I);
+ Instruction *commonCastTransforms(CastInst &CI);
+ Instruction *commonPointerCastTransforms(CastInst &CI);
+ Instruction *visitTrunc(TruncInst &CI);
+ Instruction *visitZExt(ZExtInst &CI);
+ Instruction *visitSExt(SExtInst &CI);
+ Instruction *visitFPTrunc(FPTruncInst &CI);
+ Instruction *visitFPExt(CastInst &CI);
+ Instruction *visitFPToUI(FPToUIInst &FI);
+ Instruction *visitFPToSI(FPToSIInst &FI);
+ Instruction *visitUIToFP(CastInst &CI);
+ Instruction *visitSIToFP(CastInst &CI);
+ Instruction *visitPtrToInt(PtrToIntInst &CI);
+ Instruction *visitIntToPtr(IntToPtrInst &CI);
+ Instruction *visitBitCast(BitCastInst &CI);
+ Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
+ Instruction *FoldItoFPtoI(Instruction &FI);
+ Instruction *visitSelectInst(SelectInst &SI);
+ Instruction *visitCallInst(CallInst &CI);
+ Instruction *visitInvokeInst(InvokeInst &II);
+
+ Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
+ Instruction *visitPHINode(PHINode &PN);
+ Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitAllocaInst(AllocaInst &AI);
+ Instruction *visitAllocSite(Instruction &FI);
+ Instruction *visitFree(CallInst &FI);
+ Instruction *visitLoadInst(LoadInst &LI);
+ Instruction *visitStoreInst(StoreInst &SI);
+ Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitFenceInst(FenceInst &FI);
+ Instruction *visitSwitchInst(SwitchInst &SI);
+ Instruction *visitReturnInst(ReturnInst &RI);
+ Instruction *visitInsertValueInst(InsertValueInst &IV);
+ Instruction *visitInsertElementInst(InsertElementInst &IE);
+ Instruction *visitExtractElementInst(ExtractElementInst &EI);
+ Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ Instruction *visitExtractValueInst(ExtractValueInst &EV);
+ Instruction *visitLandingPadInst(LandingPadInst &LI);
+ Instruction *visitVAStartInst(VAStartInst &I);
+ Instruction *visitVACopyInst(VACopyInst &I);
+
+ /// Specify what to return for unhandled instructions.
+ Instruction *visitInstruction(Instruction &I) { return nullptr; }
+
+ /// True when DB dominates all uses of DI except UI.
+ /// UI must be in the same block as DI.
+ /// The routine checks that the DI parent and DB are different.
+ bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
+ const BasicBlock *DB) const;
+
+ /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
+ bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
+ const unsigned SIOpd);
+
+ /// Try to replace instruction \p I with value \p V which are pointers
+ /// in different address space.
+ /// \return true if successful.
+ bool replacePointer(Instruction &I, Value *V);
+
+private:
+ bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+ bool shouldChangeType(Type *From, Type *To) const;
+ Value *dyn_castNegVal(Value *V) const;
+ Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
+ Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value *> &NewIndices);
+
+ /// Classify whether a cast is worth optimizing.
+ ///
+ /// This is a helper to decide whether the simplification of
+ /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
+ ///
+ /// \param CI The cast we are interested in.
+ ///
+ /// \return true if this cast actually results in any code being generated and
+ /// if it cannot already be eliminated by some other transformation.
+ bool shouldOptimizeCast(CastInst *CI);
+
+ /// \brief Try to optimize a sequence of instructions checking if an operation
+ /// on LHS and RHS overflows.
+ ///
+ /// If this overflow check is done via one of the overflow check intrinsics,
+ /// then CtxI has to be the call instruction calling that intrinsic. If this
+ /// overflow check is done by arithmetic followed by a compare, then CtxI has
+ /// to be the arithmetic instruction.
+ ///
+ /// If a simplification is possible, stores the simplified result of the
+ /// operation in OperationResult and result of the overflow check in
+ /// OverflowResult, and return true. If no simplification is possible,
+ /// returns false.
+ bool OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, Value *RHS,
+ Instruction &CtxI, Value *&OperationResult,
+ Constant *&OverflowResult);
+
+ Instruction *visitCallSite(CallSite CS);
+ Instruction *tryOptimizeCall(CallInst *CI);
+ bool transformConstExprCastCall(CallSite CS);
+ Instruction *transformCallThroughTrampoline(CallSite CS,
+ IntrinsicInst *Tramp);
+
+ /// Transform (zext icmp) to bitwise / integer operations in order to
+ /// eliminate it.
+ ///
+ /// \param ICI The icmp of the (zext icmp) pair we are interested in.
+  /// \param CI The zext of the (zext icmp) pair we are interested in.
+ /// \param DoTransform Pass false to just test whether the given (zext icmp)
+ /// would be transformed. Pass true to actually perform the transformation.
+ ///
+ /// \return null if the transformation cannot be performed. If the
+ /// transformation can be performed the new instruction that replaces the
+ /// (zext icmp) pair will be returned (if \p DoTransform is false the
+ /// unmodified \p ICI will be returned in this case).
+ Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform = true);
+
+ Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
+ bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ Value *EmitGEPOffset(User *GEP);
+ Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+ Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
+ Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
+ Instruction *shrinkBitwiseLogic(TruncInst &Trunc);
+ Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
+
+ /// Determine if a pair of casts can be replaced by a single cast.
+ ///
+ /// \param CI1 The first of a pair of casts.
+ /// \param CI2 The second of a pair of casts.
+ ///
+ /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
+ /// Instruction::CastOps value for a cast that can replace the pair, casting
+ /// CI1->getSrcTy() to CI2->getDstTy().
+ ///
+ /// \see CastInst::isEliminableCastPair
+ Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2);
+
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
+ Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
+ Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+
+ Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd, Instruction &CxtI);
+public:
+ /// \brief Inserts an instruction \p New before instruction \p Old
+ ///
+ /// Also adds the new instruction to the worklist and returns \p New so that
+ /// it is suitable for use as the return from the visitation patterns.
+ Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+ assert(New && !New->getParent() &&
+ "New instruction already inserted into a basic block!");
+ BasicBlock *BB = Old.getParent();
+ BB->getInstList().insert(Old.getIterator(), New); // Insert inst
+ Worklist.Add(New);
+ return New;
+ }
+
+ /// \brief Same as InsertNewInstBefore, but also sets the debug loc.
+ Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+ New->setDebugLoc(Old.getDebugLoc());
+ return InsertNewInstBefore(New, Old);
+ }
+
+ /// \brief A combiner-aware RAUW-like routine.
+ ///
+ /// This method is to be used when an instruction is found to be dead,
+ /// replaceable with another preexisting expression. Here we add all uses of
+ /// I to the worklist, replace all uses of I with the new value, then return
+ /// I, so that the inst combiner will know that I was modified.
+ Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
+ // If there are no uses to replace, then we return nullptr to indicate that
+ // no changes were made to the program.
+ if (I.use_empty()) return nullptr;
+
+ Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
+
+ // If we are replacing the instruction with itself, this must be in a
+ // segment of unreachable code, so just clobber the instruction.
+ if (&I == V)
+ V = UndefValue::get(I.getType());
+
+ DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+ << " with " << *V << '\n');
+
+ I.replaceAllUsesWith(V);
+ return &I;
+ }
+
+ /// Creates a result tuple for an overflow intrinsic \p II with a given
+ /// \p Result and a constant \p Overflow value.
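+  ///
+  /// For example, for a call to llvm.uadd.with.overflow.i32 the result type is
+  /// { i32, i1 }; this builds that struct with \p Result in slot 0 and
+  /// \p Overflow in slot 1.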
+ Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
+ Constant *Overflow) {
+ Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+ StructType *ST = cast<StructType>(II->getType());
+ Constant *Struct = ConstantStruct::get(ST, V);
+ return InsertValueInst::Create(Struct, Result, 0);
+ }
+
+ /// \brief Combiner aware instruction erasure.
+ ///
+ /// When dealing with an instruction that has side effects or produces a void
+ /// value, we can't rely on DCE to delete the instruction. Instead, visit
+ /// methods should return the value returned by this function.
+ Instruction *eraseInstFromFunction(Instruction &I) {
+ DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+ assert(I.use_empty() && "Cannot erase instruction that is used!");
+ salvageDebugInfo(I);
+
+ // Make sure that we reprocess all operands now that we reduced their
+ // use counts.
+ if (I.getNumOperands() < 8) {
+ for (Use &Operand : I.operands())
+ if (auto *Inst = dyn_cast<Instruction>(Operand))
+ Worklist.Add(Inst);
+ }
+ Worklist.Remove(&I);
+ I.eraseFromParent();
+ MadeIRChange = true;
+    return nullptr; // Don't do anything with I
+ }
+
+ void computeKnownBits(const Value *V, KnownBits &Known,
+ unsigned Depth, const Instruction *CxtI) const {
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
+ unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) {
+ return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
+ }
+
+ bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
+ }
+ unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
+ }
+ OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+ OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ /// Maximum size of array considered when transforming.
+ uint64_t MaxArraySizeForCombine;
+
+private:
+ /// \brief Performs a few simplifications for operators which are associative
+ /// or commutative.
+ bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
+
+ /// \brief Tries to simplify binary operations which some other binary
+ /// operation distributes over.
+ ///
+ /// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)"
+ /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
+ /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
+ /// value, or null if it didn't simplify.
+ Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+
+ /// This tries to simplify binary operations by factorizing out common terms
+  /// (e.g., "(A*B)+(A*C)" -> "A*(B+C)").
+ Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
+ Value *, Value *, Value *);
+
+ /// Match a select chain which produces one of three values based on whether
+ /// the LHS is less than, equal to, or greater than RHS respectively.
+ /// Return true if we matched a three way compare idiom. The LHS, RHS, Less,
+ /// Equal and Greater values are saved in the matching process and returned to
+ /// the caller.
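+  ///
+  /// A typical shape of such a chain (a sketch; value names are illustrative):
+  ///   %lt = icmp slt i32 %lhs, %rhs
+  ///   %eq = icmp eq i32 %lhs, %rhs
+  ///   %r0 = select i1 %eq, i32 0, i32 1
+  ///   %r  = select i1 %lt, i32 -1, i32 %r0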
+ bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
+ ConstantInt *&Less, ConstantInt *&Equal,
+ ConstantInt *&Greater);
+
+ /// \brief Attempts to replace V with a simpler value based on the demanded
+ /// bits.
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+ bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+ const APInt &DemandedMask, KnownBits &Known,
+ unsigned Depth = 0);
+ /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+ /// bits. It also tries to handle simplifications that can be done based on
+ /// DemandedMask, but without modifying the Instruction.
+ Value *SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+ /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+  /// bits for the "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+ Value *simplifyShrShlDemandedBits(
+ Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
+ const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
+
+ /// \brief Tries to simplify operands to an integer instruction based on its
+ /// demanded bits.
+ bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
+ Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt &UndefElts, unsigned Depth = 0);
+
+ Value *SimplifyVectorOp(BinaryOperator &Inst);
+
+ /// Given a binary operator, cast instruction, or select which has a PHI node
+ /// as operand #0, see if we can fold the instruction into the PHI (which is
+ /// only possible if all operands to the PHI are constants).
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+
+ /// Given an instruction with a select as one operand and a constant as the
+ /// other operand, try to fold the binary operator into the select arguments.
+ /// This also works for Cast instructions, which obviously do not have a
+ /// second operand.
+ Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+ /// This is a convenience wrapper function for the above two functions.
+ Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
+
+ /// \brief Try to rotate an operation below a PHI node, using PHI nodes for
+ /// its operands.
+ Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
+ Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN);
+
+ /// Helper function for FoldPHIArgXIntoPHI() to get debug location for the
+ /// folded operation.
+ DebugLoc PHIArgMergedDebugLoc(PHINode &PN);
+
+ Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond, Instruction &I);
+ Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
+ const Value *Other);
+ Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV, CmpInst &ICI,
+ ConstantInt *AndCst = nullptr);
+ Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC);
+ Instruction *foldICmpAddOpConst(Instruction &ICI, Value *X, ConstantInt *CI,
+ ICmpInst::Predicate Pred);
+ Instruction *foldICmpWithCastAndCast(ICmpInst &ICI);
+
+ Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
+ Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
+ Instruction *foldICmpBinOp(ICmpInst &Cmp);
+ Instruction *foldICmpEquality(ICmpInst &Cmp);
+
+ Instruction *foldICmpSelectConstant(ICmpInst &Cmp, Instruction *Select,
+ ConstantInt *C);
+ Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc,
+ const APInt *C);
+ Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C);
+ Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
+ const APInt *C);
+ Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt *C);
+ Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
+ const APInt *C);
+ Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
+ const APInt *C);
+ Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
+ const APInt *C);
+ Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt *C);
+ Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
+ const APInt *C);
+ Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
+ const APInt *C);
+ Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
+ const APInt *C);
+ Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1);
+ Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt *C1, const APInt *C2);
+ Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+ Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+
+ Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt *C);
+ Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, const APInt *C);
+
+ // Helpers of visitSelectInst().
+ Instruction *foldSelectExtConst(SelectInst &Sel);
+ Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
+ Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
+ Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+ Value *A, Value *B, Instruction &Outer,
+ SelectPatternFlavor SPF2, Value *C);
+ Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+
+ Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS,
+ ConstantInt *AndRHS, BinaryOperator &TheAnd);
+
+ Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
+ bool isSigned, bool Inside);
+ Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+ Instruction *MatchBSwap(BinaryOperator &I);
+ bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+ Instruction *
+ SimplifyElementUnorderedAtomicMemCpy(ElementUnorderedAtomicMemCpyInst *AMI);
+ Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+ Instruction *SimplifyMemSet(MemSetInst *MI);
+
+ Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+ /// \brief Returns a value X such that Val = X * Scale, or null if none.
+ ///
+ /// If the multiplication is known not to overflow then NoSignedWrap is set.
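+  ///
+  /// For example, if \p Val is 'mul i32 %y, 8' and Scale is 4, a value
+  /// equivalent to 'mul i32 %y, 2' satisfies the contract (a sketch of the
+  /// contract only; the exact IR returned may differ).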
+ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+};
+
+} // end namespace llvm.
+
+#undef DEBUG_TYPE
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
new file mode 100644
index 000000000000..c59e1ce69ac2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -0,0 +1,1561 @@
+//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for load, store and alloca.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
+
+/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable. This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool pointsToConstantGlobal(Value *V) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return GV->isConstant();
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::AddrSpaceCast ||
+ CE->getOpcode() == Instruction::GetElementPtr)
+ return pointsToConstantGlobal(CE->getOperand(0));
+ }
+ return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca. Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses. If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
+/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
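+///
+/// A sketch of the pattern this enables (value names are illustrative only):
+///   %buf = alloca [16 x i8]
+///   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 16, ...)
+/// where %dst is (a cast of) %buf, %src points into a constant global, and all
+/// other uses of %buf are reads; the caller can then read from the global
+/// directly and drop the copy.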
+static bool
+isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ // We track lifetime intrinsics as we encounter them. If we decide to go
+ // ahead and replace the value with the global, this lets the caller quickly
+ // eliminate the markers.
+
+ SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
+ ValuesToInspect.emplace_back(V, false);
+ while (!ValuesToInspect.empty()) {
+ auto ValuePair = ValuesToInspect.pop_back_val();
+ const bool IsOffset = ValuePair.second;
+ for (auto &U : ValuePair.first->uses()) {
+ auto *I = cast<Instruction>(U.getUser());
+
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+        // Ignore simple (non-volatile, non-atomic) loads; they are always ok.
+ if (!LI->isSimple()) return false;
+ continue;
+ }
+
+ if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
+ // If uses of the bitcast are ok, we are ok.
+ ValuesToInspect.emplace_back(I, IsOffset);
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+        // If the GEP has all zero indices, it doesn't offset the pointer;
+        // otherwise, it does.
+ ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
+ continue;
+ }
+
+ if (auto CS = CallSite(I)) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (CS.isCallee(&U))
+ continue;
+
+ unsigned DataOpNo = CS.getDataOperandNo(&U);
+ bool IsArgOperand = CS.isArgOperand(&U);
+
+ // Inalloca arguments are clobbered by the call.
+ if (IsArgOperand && CS.isInAllocaArgument(DataOpNo))
+ return false;
+
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ if (CS.onlyReadsMemory() &&
+ (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (IsArgOperand && CS.isByValArgument(DataOpNo))
+ continue;
+ }
+
+ // Lifetime intrinsics can be handled by the caller.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ assert(II->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(II);
+ continue;
+ }
+ }
+
+      // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+ if (!MI)
+ return false;
+
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (U.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
+
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
+
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
+
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (U.getOperandNo() != 0) return false;
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!pointsToConstantGlobal(MI->getSource()))
+ return false;
+
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
+ }
+ return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return the memcpy/memmove if the specified
+/// alloca is only modified by a copy from a constant global, and null
+/// otherwise. If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ MemTransferInst *TheCopy = nullptr;
+ if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+ return TheCopy;
+ return nullptr;
+}
+
+/// Returns true if V is dereferenceable for the size of the alloca.
+static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
+ const DataLayout &DL) {
+ if (AI->isArrayAllocation())
+ return false;
+ uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
+ if (!AllocaSize)
+ return false;
+ return isDereferenceableAndAlignedPointer(V, AI->getAlignment(),
+ APInt(64, AllocaSize), DL);
+}
+
+static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
+ // Check for array size of 1 (scalar allocation).
+ if (!AI.isArrayAllocation()) {
+ // i32 1 is the canonical array size for scalar allocations.
+ if (AI.getArraySize()->getType()->isIntegerTy(32))
+ return nullptr;
+
+ // Canonicalize it.
+ Value *V = IC.Builder.getInt32(1);
+ AI.setOperand(0, V);
+ return &AI;
+ }
+
+ // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
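+ //
+ // For example (element type and names picked for illustration):
+ //   %a = alloca i32, i32 4
+ // becomes
+ //   %a1 = alloca [4 x i32]
+ //   %a1.sub = getelementptr inbounds [4 x i32], [4 x i32]* %a1, i64 0, i64 0
+ // and every use of %a is rewritten to use %a1.sub, which has type i32*.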
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+ Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+ AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
+ New->setAlignment(AI.getAlignment());
+
+ // Scan to the end of the allocation instructions, to skip over a block of
+ // allocas if possible...also skip interleaved debug info
+ //
+ BasicBlock::iterator It(New);
+ while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
+ ++It;
+
+ // Now that It is pointing to the first non-allocation-inst in the block,
+ // insert our getelementptr instruction...
+ //
+ Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ Value *NullIdx = Constant::getNullValue(IdxTy);
+ Value *Idx[2] = {NullIdx, NullIdx};
+ Instruction *GEP =
+ GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
+ IC.InsertNewInstBefore(GEP, *It);
+
+ // Now make everything use the getelementptr instead of the original
+ // allocation.
+ return IC.replaceInstUsesWith(AI, GEP);
+ }
+
+ if (isa<UndefValue>(AI.getArraySize()))
+ return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // Ensure that the alloca array size argument has type intptr_t, so that
+ // any casting is exposed early.
+ Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ if (AI.getArraySize()->getType() != IntPtrTy) {
+ Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
+ AI.setOperand(0, V);
+ return &AI;
+ }
+
+ return nullptr;
+}
+
+namespace {
+// If I and V are pointers in different address space, it is not allowed to
+// use replaceAllUsesWith since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V since
+// the two address space may be disjoint depending on target.
+//
+// This class chases down uses of the old pointer until reaching the load
+// instructions, then replaces the old pointer in the load instructions with
+// the new pointer. If during the chasing it sees bitcast or GEP, it will
+// create new bitcast or GEP with the new pointer and use them in the load
+// instruction.
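+//
+// For example (address spaces and types picked for illustration), given
+//
+//   %a = alloca [4 x i32]
+//   %p = bitcast [4 x i32]* %a to i32*
+//   %v = load i32, i32* %p
+//
+// and a replacement constant @G in addrspace(2), the bitcast is re-created as
+//
+//   %p1 = bitcast [4 x i32] addrspace(2)* @G to i32 addrspace(2)*
+//
+// and the load is rewritten to load from %p1, so no addrspacecast is needed.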
+class PointerReplacer {
+public:
+ PointerReplacer(InstCombiner &IC) : IC(IC) {}
+ void replacePointer(Instruction &I, Value *V);
+
+private:
+ void findLoadAndReplace(Instruction &I);
+ void replace(Instruction *I);
+ Value *getReplacement(Value *I);
+
+ SmallVector<Instruction *, 4> Path;
+ MapVector<Value *, Value *> WorkMap;
+ InstCombiner &IC;
+};
+} // end anonymous namespace
+
+void PointerReplacer::findLoadAndReplace(Instruction &I) {
+ for (auto U : I.users()) {
+ auto *Inst = dyn_cast<Instruction>(&*U);
+ if (!Inst)
+ return;
+ DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+ if (isa<LoadInst>(Inst)) {
+ for (auto P : Path)
+ replace(P);
+ replace(Inst);
+ } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
+ Path.push_back(Inst);
+ findLoadAndReplace(*Inst);
+ Path.pop_back();
+ } else {
+ return;
+ }
+ }
+}
+
+Value *PointerReplacer::getReplacement(Value *V) {
+ auto Loc = WorkMap.find(V);
+ if (Loc != WorkMap.end())
+ return Loc->second;
+ return nullptr;
+}
+
+void PointerReplacer::replace(Instruction *I) {
+ if (getReplacement(I))
+ return;
+
+ if (auto *LT = dyn_cast<LoadInst>(I)) {
+ auto *V = getReplacement(LT->getPointerOperand());
+ assert(V && "Operand not replaced");
+ auto *NewI = new LoadInst(V);
+ NewI->takeName(LT);
+ IC.InsertNewInstWith(NewI, *LT);
+ IC.replaceInstUsesWith(*LT, NewI);
+ WorkMap[LT] = NewI;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ auto *V = getReplacement(GEP->getPointerOperand());
+ assert(V && "Operand not replaced");
+ SmallVector<Value *, 8> Indices;
+ Indices.append(GEP->idx_begin(), GEP->idx_end());
+ auto *NewI = GetElementPtrInst::Create(
+ V->getType()->getPointerElementType(), V, Indices);
+ IC.InsertNewInstWith(NewI, *GEP);
+ NewI->takeName(GEP);
+ WorkMap[GEP] = NewI;
+ } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+ auto *V = getReplacement(BC->getOperand(0));
+ assert(V && "Operand not replaced");
+ auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+ V->getType()->getPointerAddressSpace());
+ auto *NewI = new BitCastInst(V, NewT);
+ IC.InsertNewInstWith(NewI, *BC);
+ NewI->takeName(BC);
+ WorkMap[BC] = NewI;
+ } else {
+ llvm_unreachable("should never reach here");
+ }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+ auto *PT = cast<PointerType>(I.getType());
+ auto *NT = cast<PointerType>(V->getType());
+ assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+ "Invalid usage");
+#endif
+ WorkMap[&I] = V;
+ findLoadAndReplace(I);
+}
+
+Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
+ if (auto *I = simplifyAllocaArraySize(*this, AI))
+ return I;
+
+ if (AI.getAllocatedType()->isSized()) {
+ // If the alignment is 0 (unspecified), assign it the preferred alignment.
+ if (AI.getAlignment() == 0)
+ AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType()));
+
+ // Move all allocas of zero-byte objects to the entry block and merge them
+ // together. Note that we only do this for allocas, because malloc should
+ // allocate and return a unique pointer, even for a zero-byte allocation.
+ if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) {
+ // For a zero sized alloca there is no point in doing an array allocation.
+ // This is helpful if the array size is a complicated expression not used
+ // elsewhere.
+ if (AI.isArrayAllocation()) {
+ AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
+ return &AI;
+ }
+
+ // Get the first instruction in the entry block.
+ BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
+ Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
+ if (FirstInst != &AI) {
+ // If the entry block doesn't start with a zero-size alloca then move
+ // this one to the start of the entry block. There is no problem with
+ // dominance as the array size was already forced to a constant.
+ AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
+ if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+ AI.moveBefore(FirstInst);
+ return &AI;
+ }
+
+ // If the alignment of the entry block alloca is 0 (unspecified),
+ // assign it the preferred alignment.
+ if (EntryAI->getAlignment() == 0)
+ EntryAI->setAlignment(
+ DL.getPrefTypeAlignment(EntryAI->getAllocatedType()));
+ // Replace this zero-sized alloca with the one at the start of the entry
+ // block after ensuring that the address will be aligned enough for both
+ // types.
+ unsigned MaxAlign = std::max(EntryAI->getAlignment(),
+ AI.getAlignment());
+ EntryAI->setAlignment(MaxAlign);
+ if (AI.getType() != EntryAI->getType())
+ return new BitCastInst(EntryAI, AI.getType());
+ return replaceInstUsesWith(AI, EntryAI);
+ }
+ }
+ }
+
+ if (AI.getAlignment()) {
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant global whose alignment is equal to or exceeds that of the
+ // allocation. If this is the case, we can change all users to use
+ // the constant global instead. This is commonly produced by the CFE for
+ // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+ // is only subsequently read.
+ SmallVector<Instruction *, 4> ToDelete;
+ if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
+ unsigned SourceAlign = getOrEnforceKnownAlignment(
+ Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
+ if (AI.getAlignment() <= SourceAlign &&
+ isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
+ DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+ eraseInstFromFunction(*ToDelete[i]);
+ Constant *TheSrc = cast<Constant>(Copy->getSource());
+ auto *SrcTy = TheSrc->getType();
+ auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
+ SrcTy->getPointerAddressSpace());
+ Constant *Cast =
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+ if (AI.getType()->getPointerAddressSpace() ==
+ SrcTy->getPointerAddressSpace()) {
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ } else {
+ PointerReplacer PtrReplacer(*this);
+ PtrReplacer.replacePointer(AI, Cast);
+ ++NumGlobalCopies;
+ }
+ }
+ }
+ }
+
+ // Finally, use the generic allocation site handler to aggressively remove
+ // unused allocas.
+ return visitAllocSite(AI);
+}
+
+// Are we allowed to form an atomic load or store of this type?
+static bool isSupportedAtomicType(Type *Ty) {
+ return Ty->isIntegerTy() || Ty->isPointerTy() || Ty->isFloatingPointTy();
+}
+
+/// \brief Helper to combine a load to a new type.
+///
+/// This just does the work of combining a load to a new type. It handles
+/// metadata, etc., and returns the new instruction. The \c NewTy should be the
+/// loaded *value* type. This will convert it to a pointer, cast the operand to
+/// that pointer type, load it, etc.
+///
+/// Note that this will create all of the instructions with whatever insert
+/// point the \c InstCombiner currently is using.
+static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy,
+ const Twine &Suffix = "") {
+ assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
+ "can't fold an atomic load to requested type");
+
+ Value *Ptr = LI.getPointerOperand();
+ unsigned AS = LI.getPointerAddressSpace();
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ LI.getAllMetadata(MD);
+
+ LoadInst *NewLoad = IC.Builder.CreateAlignedLoad(
+ IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
+ LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ MDBuilder MDB(NewLoad->getContext());
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a load instruction changing *only its type*.
+ // The only metadata it makes sense to drop is metadata which is invalidated
+ // when the pointer type changes. This should essentially never be the case
+ // in LLVM, but we explicitly switch over only known metadata to be
+ // conservatively correct. If you are adding metadata to LLVM which pertains
+ // to loads, you almost certainly want to add it here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ // All of these directly apply.
+ NewLoad->setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_nonnull:
+ copyNonnullMetadata(LI, N, *NewLoad);
+ break;
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These only directly apply if the new type is also a pointer.
+ if (NewTy->isPointerTy())
+ NewLoad->setMetadata(ID, N);
+ break;
+ case LLVMContext::MD_range:
+ copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);
+ break;
+ }
+ }
+ return NewLoad;
+}
+
+/// \brief Combine a store to a new type.
+///
+/// Returns the newly created store instruction.
+static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
+ assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
+ "can't fold an atomic store of requested type");
+
+ Value *Ptr = SI.getPointerOperand();
+ unsigned AS = SI.getPointerAddressSpace();
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ SI.getAllMetadata(MD);
+
+ StoreInst *NewStore = IC.Builder.CreateAlignedStore(
+ V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+ SI.getAlignment(), SI.isVolatile());
+ NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a store instruction changing *only its
+ // type*. The only metadata it makes sense to drop is metadata which is
+ // invalidated when the pointer type changes. This should essentially
+ // never be the case in LLVM, but we explicitly switch over only known
+ // metadata to be conservatively correct. If you are adding metadata to
+ // LLVM which pertains to stores, you almost certainly want to add it
+ // here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ // All of these directly apply.
+ NewStore->setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_nonnull:
+ case LLVMContext::MD_range:
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These don't apply for stores.
+ break;
+ }
+ }
+
+ return NewStore;
+}
+
+/// \brief Combine loads to match the type of their uses' value after looking
+/// through intervening bitcasts.
+///
+/// The core idea here is that if the result of a load is used in an operation,
+/// we should load the type most conducive to that operation. For example, when
+/// loading an integer and converting that immediately to a pointer, we should
+/// instead directly load a pointer.
+///
+/// However, this routine must never change the width of a load or the number of
+/// loads as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows loads to more closely model the types
+/// of their consuming operations.
+///
+/// Currently, we also refuse to change the precise type used for an atomic load
+/// or a volatile load. This is debatable, and might be reasonable to change
+/// later. However, it is risky in case some backend or other part of LLVM is
+/// relying on the exact type loaded to select appropriate atomic operations.
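+///
+/// As a small illustration (assuming 64-bit pointers; names are arbitrary):
+///
+///   %x = load i64, i64* %p
+///   %y = inttoptr i64 %x to i8*
+///
+/// can become
+///
+///   %p.cast = bitcast i64* %p to i8**
+///   %y = load i8*, i8** %p.cast
+///
+/// because the inttoptr is a no-op cast when i64 matches the pointer width.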
+static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic loads here but it isn't clear that this is important.
+ if (!LI.isUnordered())
+ return nullptr;
+
+ if (LI.use_empty())
+ return nullptr;
+
+ // swifterror values can't be bitcasted.
+ if (LI.getPointerOperand()->isSwiftError())
+ return nullptr;
+
+ Type *Ty = LI.getType();
+ const DataLayout &DL = IC.getDataLayout();
+
+ // Try to canonicalize loads which are only ever stored to operate over
+ // integers instead of any other type. We only do this when the loaded type
+ // is sized and has a size exactly the same as its store size and the store
+ // size is a legal integer type.
+ if (!Ty->isIntegerTy() && Ty->isSized() &&
+ DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
+ DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty) &&
+ !DL.isNonIntegralPointerType(Ty)) {
+ if (all_of(LI.users(), [&LI](User *U) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ return SI && SI->getPointerOperand() != &LI &&
+ !SI->getPointerOperand()->isSwiftError();
+ })) {
+ LoadInst *NewLoad = combineLoadToNewType(
+ IC, LI,
+ Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
+ // Replace all the stores with stores of the newly loaded value.
+ for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
+ auto *SI = cast<StoreInst>(*UI++);
+ IC.Builder.SetInsertPoint(SI);
+ combineStoreToNewValue(IC, *SI, NewLoad);
+ IC.eraseInstFromFunction(*SI);
+ }
+ assert(LI.use_empty() && "Failed to remove all users of the load!");
+ // Return the old load so the combiner can delete it safely.
+ return &LI;
+ }
+ }
+
+ // Fold away bit casts of the loaded value by loading the desired type.
+ // We can do this for BitCastInsts as well as casts from and to pointer types,
+ // as long as those are noops (i.e., the source or dest type have the same
+ // bitwidth as the target's pointers).
+ if (LI.hasOneUse())
+ if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
+ if (CI->isNoopCast(DL))
+ if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
+ LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
+ CI->replaceAllUsesWith(NewLoad);
+ IC.eraseInstFromFunction(*CI);
+ return &LI;
+ }
+
+ // FIXME: We should also canonicalize loads of vectors when their elements are
+ // cast to other types.
+ return nullptr;
+}
+
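+/// Try to split up a load of an aggregate into loads of its elements.
+///
+/// A one-element struct or array is loaded as its single element and then
+/// rewrapped with insertvalue; a padding-free struct or a small array is
+/// loaded element by element and reassembled the same way. Returns the
+/// replacement, or null if the load is left unchanged.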
+static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
+ // FIXME: We could probably, with some care, handle both volatile and atomic
+ // loads here, but it isn't clear that this is important.
+ if (!LI.isSimple())
+ return nullptr;
+
+ Type *T = LI.getType();
+ if (!T->isAggregateType())
+ return nullptr;
+
+ StringRef Name = LI.getName();
+ assert(LI.getAlignment() && "Alignment must be set at this point");
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+ // If the struct only has one element, we unpack it.
+ auto NumElements = ST->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
+ ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // We don't want to break up loads of structs with padding here, as we'd
+ // lose the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return nullptr;
+
+ auto Align = LI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(ST);
+
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt32Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ for (unsigned i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+ auto *L = IC.Builder.CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
+ // Propagate AA metadata. It'll still be valid on the narrowed load.
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+ auto *ET = AT->getElementType();
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return nullptr;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(ET);
+ auto Align = LI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
+ Name + ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ Offset += EltSize;
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ return nullptr;
+}
+
+// If we can determine that all possible objects pointed to by the provided
+// pointer value are, not only dereferenceable, but also definitively less than
+// or equal to the provided maximum size, then return true; constant global
+// values and allocas are the kinds of objects we can currently prove this
+// for. Otherwise, return false.
+//
+// FIXME: This should probably live in ValueTracking (or similar).
+static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
+ const DataLayout &DL) {
+ SmallPtrSet<Value *, 4> Visited;
+ SmallVector<Value *, 4> Worklist(1, V);
+
+ do {
+ Value *P = Worklist.pop_back_val();
+ P = P->stripPointerCasts();
+
+ if (!Visited.insert(P).second)
+ continue;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+ Worklist.push_back(SI->getTrueValue());
+ Worklist.push_back(SI->getFalseValue());
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(P)) {
+ for (Value *IncValue : PN->incoming_values())
+ Worklist.push_back(IncValue);
+ continue;
+ }
+
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
+ if (GA->isInterposable())
+ return false;
+ Worklist.push_back(GA->getAliasee());
+ continue;
+ }
+
+ // If we know how big this object is, and it is less than MaxSize, continue
+ // searching. Otherwise, return false.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
+ if (!AI->getAllocatedType()->isSized())
+ return false;
+
+ ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (!CS)
+ return false;
+
+ uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
+ // Make sure that, even if the multiplication below would wrap as a
+ // uint64_t, we still do the right thing.
+ if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
+ return false;
+ continue;
+ }
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
+ return false;
+
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+ if (InitSize > MaxSize)
+ return false;
+ continue;
+ }
+
+ return false;
+ } while (!Worklist.empty());
+
+ return true;
+}
+
+// If we're indexing into an object of a known size, and the outer index is
+// not a constant, but having any value but zero would lead to undefined
+// behavior, replace it with zero.
+//
+// For example, if we have:
+// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+// ...
+// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
+// ... = load i32* %arrayidx, align 4
+// Then we know that we can replace %x in the GEP with i64 0.
+//
+// FIXME: We could fold any GEP index to zero that would cause UB if it were
+// not zero. Currently, we only handle the first such index. We could also
+// search through non-zero constant indices if we kept track of the
+// offsets those indices implied.
+static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
+ Instruction *MemI, unsigned &Idx) {
+ if (GEPI->getNumOperands() < 2)
+ return false;
+
+ // Find the first non-zero index of a GEP. If all indices are zero, return
+ // one past the last index.
+ auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
+ unsigned I = 1;
+ for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
+ Value *V = GEPI->getOperand(I);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ if (CI->isZero())
+ continue;
+
+ break;
+ }
+
+ return I;
+ };
+
+ // Skip through initial 'zero' indices, and find the corresponding pointer
+ // type. See if the next index is not a constant.
+ Idx = FirstNZIdx(GEPI);
+ if (Idx == GEPI->getNumOperands())
+ return false;
+ if (isa<Constant>(GEPI->getOperand(Idx)))
+ return false;
+
+ SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
+ Type *AllocTy =
+ GetElementPtrInst::getIndexedType(GEPI->getSourceElementType(), Ops);
+ if (!AllocTy || !AllocTy->isSized())
+ return false;
+ const DataLayout &DL = IC.getDataLayout();
+ uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy);
+
+ // If there are more indices after the one we might replace with a zero, make
+ // sure they're all non-negative. If any of them are negative, the overall
+ // address being computed might be before the base address determined by the
+ // first non-zero index.
+ auto IsAllNonNegative = [&]() {
+ for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
+ continue;
+ return false;
+ }
+
+ return true;
+ };
+
+ // FIXME: If the GEP is not inbounds, and there are extra indices after the
+ // one we'll replace, those could cause the address computation to wrap
+ // (rendering the IsAllNonNegative() check below insufficient). We can do
+ // better, ignoring zero indices (and other indices we can prove small
+ // enough not to wrap).
+ if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
+ return false;
+
+ // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
+ // also known to be dereferenceable.
+ return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
+ IsAllNonNegative();
+}
+
+// If we're indexing into an object with a variable index for the memory
+// access, but the object has only one element, we can assume that the index
+// will always be zero. If we replace the GEP, return it.
+template <typename T>
+static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
+ T &MemI) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
+ unsigned Idx;
+ if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
+ Instruction *NewGEPI = GEPI->clone();
+ NewGEPI->setOperand(Idx,
+ ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
+ NewGEPI->insertBefore(GEPI);
+ MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
+ return NewGEPI;
+ }
+ }
+
+ return nullptr;
+}
+
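+/// Return true if loading through Op is known to be undefined behavior:
+/// either Op is undef, or it is (possibly via a GEP) a null pointer in
+/// address space 0.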
+static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ const Value *GEPI0 = GEPI->getOperand(0);
+ if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0)
+ return true;
+ }
+ if (isa<UndefValue>(Op) ||
+ (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0))
+ return true;
+ return false;
+}
+
+Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
+ Value *Op = LI.getOperand(0);
+
+ // Try to canonicalize the loaded type.
+ if (Instruction *Res = combineLoadToOperationType(*this, LI))
+ return Res;
+
+ // Attempt to improve the alignment.
+ unsigned KnownAlign = getOrEnforceKnownAlignment(
+ Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT);
+ unsigned LoadAlign = LI.getAlignment();
+ unsigned EffectiveLoadAlign =
+ LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());
+
+ if (KnownAlign > EffectiveLoadAlign)
+ LI.setAlignment(KnownAlign);
+ else if (LoadAlign == 0)
+ LI.setAlignment(EffectiveLoadAlign);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
+ Worklist.Add(NewGEPI);
+ return &LI;
+ }
+
+ if (Instruction *Res = unpackLoadToAggregate(*this, LI))
+ return Res;
+
+ // Do really simple store-to-load forwarding and load CSE, to catch cases
+ // where there are several consecutive memory accesses to the same location,
+ // separated by a few arithmetic operations.
+ BasicBlock::iterator BBI(LI);
+ bool IsLoadCSE = false;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ if (IsLoadCSE)
+ combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);
+
+ return replaceInstUsesWith(
+ LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast"));
+ }
+
+ // None of the following transforms are legal for volatile/ordered atomic
+ // loads. Most of them do apply for unordered atomics.
+ if (!LI.isUnordered()) return nullptr;
+
+ // load(gep null, ...) -> unreachable
+ // load null/undef -> unreachable
+ // TODO: Consider a target hook for valid address spaces for these transforms.
+ if (canSimplifyNullLoadOrGEP(LI, Op)) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+
+ if (Op->hasOneUse()) {
+ // Change select and PHI nodes to select values instead of addresses: this
+ // helps alias analysis out a lot, allows many other simplifications, and
+ // exposes redundancy in the code.
+ //
+ // Note that we cannot do the transformation unless we know that the
+ // introduced loads cannot trap! Something like this is valid as long as
+ // the condition is always false: load (select bool %C, int* null, int* %G),
+ // but it would not be valid if we transformed it to load from null
+ // unconditionally.
+ //
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+ // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
+ unsigned Align = LI.getAlignment();
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
+ LoadInst *V1 = Builder.CreateLoad(SI->getOperand(1),
+ SI->getOperand(1)->getName()+".val");
+ LoadInst *V2 = Builder.CreateLoad(SI->getOperand(2),
+ SI->getOperand(2)->getName()+".val");
+ assert(LI.isUnordered() && "implied by above");
+ V1->setAlignment(Align);
+ V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ V2->setAlignment(Align);
+ V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ return SelectInst::Create(SI->getCondition(), V1, V2);
+ }
+
+ // load (select (cond, null, P)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
+ LI.getPointerAddressSpace() == 0) {
+ LI.setOperand(0, SI->getOperand(2));
+ return &LI;
+ }
+
+ // load (select (cond, P, null)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
+ LI.getPointerAddressSpace() == 0) {
+ LI.setOperand(0, SI->getOperand(1));
+ return &LI;
+ }
+ }
+ }
+ return nullptr;
+}
+
+/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+/// %E0 = extractelement <2 x double> %U, i32 0
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0
+/// %E1 = extractelement <2 x double> %U, i32 1
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
+static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
+ Value *U = nullptr;
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+ if (!E)
+ return nullptr;
+ auto *W = E->getVectorOperand();
+ if (!U)
+ U = W;
+ else if (U != W)
+ return nullptr;
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+ return nullptr;
+ V = IV->getAggregateOperand();
+ }
+ if (!isa<UndefValue>(V) || !U)
+ return nullptr;
+
+ auto *UT = cast<VectorType>(U->getType());
+ auto *VT = V->getType();
+ // Check that types UT and VT are bitwise isomorphic.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
+ return nullptr;
+ }
+ if (auto *AT = dyn_cast<ArrayType>(VT)) {
+ if (AT->getNumElements() != UT->getNumElements())
+ return nullptr;
+ } else {
+ auto *ST = cast<StructType>(VT);
+ if (ST->getNumElements() != UT->getNumElements())
+ return nullptr;
+ for (const auto *EltT : ST->elements()) {
+ if (EltT != UT->getElementType())
+ return nullptr;
+ }
+ }
+ return U;
+}
+
+/// \brief Combine stores to match the type of value being stored.
+///
+/// The core idea here is that memory does not have any intrinsic type, so
+/// where we can, we should match the type of a store to the type of the value
+/// being stored.
+///
+/// However, this routine must never change the width of a store or the number of
+/// stores as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows stores to more closely model the types
+/// of their incoming values.
+///
+/// Currently, we also refuse to change the precise type used for an atomic or
+/// volatile store. This is debatable, and might be reasonable to change later.
+/// However, it is risky in case some backend or other part of LLVM is relying
+/// on the exact type stored to select appropriate atomic operations.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction as otherwise there is no way to signal whether it was
+/// combined or not: IC.eraseInstFromFunction returns a null pointer.
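+///
+/// A minimal illustration (the float/i32 pair and the names are arbitrary):
+///
+///   %b = bitcast float %f to i32
+///   store i32 %b, i32* %p
+///
+/// becomes
+///
+///   %p.cast = bitcast i32* %p to float*
+///   store float %f, float* %p.cast
+///
+/// so the store's type matches the value actually being produced.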
+static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic stores here but it isn't clear that this is important.
+ if (!SI.isUnordered())
+ return false;
+
+ // swifterror values can't be bitcasted.
+ if (SI.getPointerOperand()->isSwiftError())
+ return false;
+
+ Value *V = SI.getValueOperand();
+
+ // Fold away bit casts of the stored value by storing the original type.
+ if (auto *BC = dyn_cast<BitCastInst>(V)) {
+ V = BC->getOperand(0);
+ if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+ }
+
+ if (Value *U = likeBitCastFromVector(IC, V))
+ if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
+
+ // FIXME: We should also canonicalize stores of vectors when their elements
+ // are cast to other types.
+ return false;
+}
+
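+/// Try to split up a store of an aggregate into stores of its elements.
+///
+/// A one-element struct or array is stored as its single element; a
+/// padding-free struct or a small array is stored element by element via
+/// extractvalue. Returns true if the original store became dead and must be
+/// erased by the caller.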
+static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
+ // FIXME: We could probably with some care handle both volatile and atomic
+ // stores here but it isn't clear that this is important.
+ if (!SI.isSimple())
+ return false;
+
+ Value *V = SI.getValueOperand();
+ Type *T = V->getType();
+
+ if (!T->isAggregateType())
+ return false;
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+ // If the struct only has one element, we unpack it.
+ unsigned Count = ST->getNumElements();
+ if (Count == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+ // We don't want to break up stores of structs with padding here, as we'd
+ // lose the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return false;
+
+ auto Align = SI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(ST);
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt32Ty(ST->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+ for (unsigned i = 0; i < Count; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+ llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ }
+
+ return true;
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+ // If the array only has one element, we unpack it.
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return false;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(AT->getElementType());
+ auto Align = SI.getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = MinAlign(Align, Offset);
+ Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ Offset += EltSize;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr \@a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr \@a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+ // Test if the values come from identical arithmetic instructions.
+ // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+ // it's only used to compare two uses within the same basic block, which
+ // means that they'll always either have the same value or one of them
+ // will have an undefined value.
+ if (isa<BinaryOperator>(A) ||
+ isa<CastInst>(A) ||
+ isa<PHINode>(A) ||
+ isa<GetElementPtrInst>(A))
+ if (Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
+}
+
+Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
+ Value *Val = SI.getOperand(0);
+ Value *Ptr = SI.getOperand(1);
+
+ // Try to canonicalize the stored type.
+ if (combineStoreToValueType(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Attempt to improve the alignment.
+ unsigned KnownAlign = getOrEnforceKnownAlignment(
+ Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT);
+ unsigned StoreAlign = SI.getAlignment();
+ unsigned EffectiveStoreAlign =
+ StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());
+
+ if (KnownAlign > EffectiveStoreAlign)
+ SI.setAlignment(KnownAlign);
+ else if (StoreAlign == 0)
+ SI.setAlignment(EffectiveStoreAlign);
+
+ // Try to canonicalize the stored type.
+ if (unpackStoreToAggregate(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
+ Worklist.Add(NewGEPI);
+ return &SI;
+ }
+
+ // Don't hack volatile/ordered stores.
+ // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
+ if (!SI.isUnordered()) return nullptr;
+
+ // If the RHS is an alloca with a single use, zapify the store, making the
+ // alloca dead.
+ if (Ptr->hasOneUse()) {
+ if (isa<AllocaInst>(Ptr))
+ return eraseInstFromFunction(SI);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (isa<AllocaInst>(GEP->getOperand(0))) {
+ if (GEP->getOperand(0)->hasOneUse())
+ return eraseInstFromFunction(SI);
+ }
+ }
+ }
+
+ // Do really simple DSE, to catch cases where there are several consecutive
+ // stores to the same location, separated by a few arithmetic operations. This
+ // situation often occurs with bitfield accesses.
+ BasicBlock::iterator BBI(SI);
+ for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+ --ScanInsts) {
+ --BBI;
+ // Don't count debug info directives, lest they affect codegen, and skip
+ // pointer-to-pointer bitcasts, which are no-ops.
+ if (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ ScanInsts++;
+ continue;
+ }
+
+ if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+ // Is the previous store unordered and to the same location?
+ if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
+ ++NumDeadStore;
+ ++BBI;
+ eraseInstFromFunction(*PrevSI);
+ continue;
+ }
+ break;
+ }
+
+ // If this is a load, we have to stop. However, if the value being stored
+ // was loaded from the very pointer we are storing to, then *this* store is
+ // dead (X = load P; store X -> P).
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
+ assert(SI.isUnordered() && "can't eliminate ordering operation");
+ return eraseInstFromFunction(SI);
+ }
+
+ // Otherwise, this is a load from some other location. Stores before it
+ // may not be dead.
+ break;
+ }
+
+ // Don't skip over loads, throws or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
+ break;
+ }
+
+ // store X, null -> turns into 'unreachable' in SimplifyCFG
+ if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) {
+ if (!isa<UndefValue>(Val)) {
+ SI.setOperand(0, UndefValue::get(Val->getType()));
+ if (Instruction *U = dyn_cast<Instruction>(Val))
+ Worklist.Add(U); // Dropped a use.
+ }
+ return nullptr; // Do not modify these!
+ }
+
+ // store undef, Ptr -> noop
+ if (isa<UndefValue>(Val))
+ return eraseInstFromFunction(SI);
+
+ // If this store is the last instruction in the basic block (possibly
+ // excepting debug info instructions), and if the block ends with an
+ // unconditional branch, try to move it to the successor block.
+ BBI = SI.getIterator();
+ do {
+ ++BBI;
+ } while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
+ if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
+ if (BI->isUnconditional())
+ if (SimplifyStoreAtEndOfBlock(SI))
+ return nullptr; // xform done!
+
+ return nullptr;
+}
+
+/// SimplifyStoreAtEndOfBlock - Turn things like:
+/// if () { *P = v1; } else { *P = v2 }
+/// into a phi node with a store in the successor.
+///
+/// Simplify things like:
+/// *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
+///
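+/// For example (block and value names are arbitrary):
+///
+///   if.then:
+///     store i32 1, i32* %P
+///     br label %merge
+///   if.else:
+///     store i32 2, i32* %P
+///     br label %merge
+///   merge:
+///     %storemerge = phi i32 [ 1, %if.then ], [ 2, %if.else ]
+///     store i32 %storemerge, i32* %P
+///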
+bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+ assert(SI.isUnordered() &&
+ "this code has not been auditted for volatile or ordered store case");
+
+ BasicBlock *StoreBB = SI.getParent();
+
+ // Check to see if the successor block has exactly two incoming edges. If
+ // so, see if the other predecessor contains a store to the same location;
+ // if so, insert a PHI node (if needed) and move the stores down.
+ BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+
+ // Determine whether Dest has exactly two predecessors and, if so, compute
+ // the other predecessor.
+ pred_iterator PI = pred_begin(DestBB);
+ BasicBlock *P = *PI;
+ BasicBlock *OtherBB = nullptr;
+
+ if (P != StoreBB)
+ OtherBB = P;
+
+ if (++PI == pred_end(DestBB))
+ return false;
+
+ P = *PI;
+ if (P != StoreBB) {
+ if (OtherBB)
+ return false;
+ OtherBB = P;
+ }
+ if (++PI != pred_end(DestBB))
+ return false;
+
+ // Bail out if the relevant blocks aren't all distinct (this can happen,
+ // for example, if SI is in an infinite loop)
+ if (StoreBB == DestBB || OtherBB == DestBB)
+ return false;
+
+ // Verify that the other block ends in a branch and is not otherwise empty.
+ BasicBlock::iterator BBI(OtherBB->getTerminator());
+ BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+ if (!OtherBr || BBI == OtherBB->begin())
+ return false;
+
+ // If the other block ends in an unconditional branch, check for the 'if then
+ // else' case: there is an instruction before the branch.
+ StoreInst *OtherStore = nullptr;
+ if (OtherBr->isUnconditional()) {
+ --BBI;
+ // Skip over debugging info.
+ while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ if (BBI==OtherBB->begin())
+ return false;
+ --BBI;
+ }
+ // If this isn't a store, isn't a store to the same location, or is not the
+ // right kind of store, bail out.
+ OtherStore = dyn_cast<StoreInst>(BBI);
+ if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ } else {
+ // Otherwise, the other block ended with a conditional branch. If one of the
+ // destinations is StoreBB, then we have the if/then case.
+ if (OtherBr->getSuccessor(0) != StoreBB &&
+ OtherBr->getSuccessor(1) != StoreBB)
+ return false;
+
+ // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+ // if/then triangle. See if there is a store to the same ptr as SI that
+ // lives in OtherBB.
+ for (;; --BBI) {
+ // Check to see if we find the matching store.
+ if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+ if (OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ break;
+ }
+ // If we find something that may be using or overwriting the stored
+ // value, or if we run out of instructions, we can't do the xform.
+ if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
+ BBI->mayWriteToMemory() || BBI == OtherBB->begin())
+ return false;
+ }
+
+ // In order to eliminate the store in OtherBB, we have to
+ // make sure nothing reads or overwrites the stored value in
+ // StoreBB.
+ for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+ // FIXME: This should really be AA driven.
+ if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
+ return false;
+ }
+ }
+
+ // Insert a PHI node now if we need it.
+ Value *MergedVal = OtherStore->getOperand(0);
+ if (MergedVal != SI.getOperand(0)) {
+ PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
+ PN->addIncoming(SI.getOperand(0), SI.getParent());
+ PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+ MergedVal = InsertNewInstBefore(PN, DestBB->front());
+ }
+
+ // Advance to a place where it is safe to insert the new store and
+ // insert it.
+ BBI = DestBB->getFirstInsertionPt();
+ StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
+ SI.isVolatile(),
+ SI.getAlignment(),
+ SI.getOrdering(),
+ SI.getSyncScopeID());
+ InsertNewInstBefore(NewSI, *BBI);
+ // The debug locations of the original instructions might differ; merge them.
+ NewSI->setDebugLoc(DILocation::getMergedLocation(SI.getDebugLoc(),
+ OtherStore->getDebugLoc()));
+
+ // If the two stores had AA tags, merge them.
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ if (AATags) {
+ OtherStore->getAAMetadata(AATags, /* Merge = */ true);
+ NewSI->setAAMetadata(AATags);
+ }
+
+ // Nuke the old stores.
+ eraseInstFromFunction(SI);
+ eraseInstFromFunction(*OtherStore);
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
new file mode 100644
index 000000000000..e3a50220f94e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -0,0 +1,1599 @@
+//===- InstCombineMulDivRem.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
+// srem, urem, frem.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+
+/// The specific integer value is used in a context where it is known to be
+/// non-zero. If this allows us to simplify the computation, do so and return
+/// the new operand, otherwise return null.
+static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
+ Instruction &CxtI) {
+ // If V has multiple uses, then we would have to do more analysis to determine
+ // if this is safe. For example, the use could be in dynamically unreached
+ // code.
+ if (!V->hasOneUse()) return nullptr;
+
+ bool MadeChange = false;
+
+ // ((1 << A) >>u B) --> (1 << (A-B))
+ // Because V cannot be zero, we know that B is less than A.
+ Value *A = nullptr, *B = nullptr, *One = nullptr;
+ if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
+ match(One, m_One())) {
+ A = IC.Builder.CreateSub(A, B);
+ return IC.Builder.CreateShl(One, A);
+ }
+
+ // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+ // inexact. Similarly for <<.
+ BinaryOperator *I = dyn_cast<BinaryOperator>(V);
+ if (I && I->isLogicalShift() &&
+ IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
+ // We know that this is an exact/nuw shift and that the input is used in a
+ // non-zero context as well.
+ if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
+ I->setOperand(0, V2);
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+ I->setIsExact();
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+ I->setHasNoUnsignedWrap();
+ MadeChange = true;
+ }
+ }
+
+ // TODO: Lots more we could do here:
+ // If V is a phi node, we can call this on each of its operands.
+ // "select cond, X, 0" can simplify to "X".
+
+ return MadeChange ? V : nullptr;
+}
+
+
+/// True if the multiply cannot be expressed in an integer of this size.
+static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+ bool IsSigned) {
+ bool Overflow;
+ if (IsSigned)
+ Product = C1.smul_ov(C2, Overflow);
+ else
+ Product = C1.umul_ov(C2, Overflow);
+
+ return Overflow;
+}
+
+/// \brief True if C1 is a multiple of C2; Quotient is set to C1/C2.
+static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+ bool IsSigned) {
+ assert(C1.getBitWidth() == C2.getBitWidth() &&
+ "Inconsistent width of constants!");
+
+ // Bail if we will divide by zero.
+ if (C2.isMinValue())
+ return false;
+
+ // Bail if we would divide INT_MIN by -1.
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ return false;
+
+ APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (IsSigned)
+ APInt::sdivrem(C1, C2, Quotient, Remainder);
+ else
+ APInt::udivrem(C1, C2, Quotient, Remainder);
+
+ return Remainder.isMinValue();
+}
+
+/// \brief A helper routine of InstCombiner::visitMul().
+///
+/// If C is a vector of known powers of 2, then this function returns
+/// a new vector obtained from C replacing each element with its logBase2.
+/// Return a null pointer otherwise.
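+///
+/// For example, <i32 1, i32 4, i32 16> maps to <i32 0, i32 2, i32 4>, while
+/// <i32 3, i32 8> yields null because 3 is not a power of 2.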
+static Constant *getLogBase2Vector(ConstantDataVector *CV) {
+ const APInt *IVal;
+ SmallVector<Constant *, 4> Elts;
+
+ for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
+ Constant *Elt = CV->getElementAsConstant(I);
+ if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2())
+ return nullptr;
+ Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2()));
+ }
+
+ return ConstantVector::get(Elts);
+}
+
+/// \brief Return true if we can prove that:
+/// (mul LHS, RHS) === (mul nsw LHS, RHS)
+bool InstCombiner::willNotOverflowSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
+ // Multiplying n * m significant bits yields a result of n + m significant
+ // bits. If the total number of significant bits does not exceed the
+ // result bit width (minus 1), there is no overflow.
+ // This means if we have enough leading sign bits in the operands
+ // we can guarantee that the result does not overflow.
+ // Ref: "Hacker's Delight" by Henry Warren
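+ //
+ // As a worked example (i16 picked arbitrarily): if both operands have at
+ // least 9 sign bits, each lies in [-128, 127], so the product lies in
+ // [-16256, 16384] and fits in i16; SignBits >= 18 > 16 + 1, so the first
+ // check below succeeds.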
+ unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
+
+ // Note that underestimating the number of sign bits gives a more
+ // conservative answer.
+ unsigned SignBits =
+ ComputeNumSignBits(LHS, 0, &CxtI) + ComputeNumSignBits(RHS, 0, &CxtI);
+
+ // First handle the easy case: if we have enough sign bits there's
+ // definitely no overflow.
+ if (SignBits > BitWidth + 1)
+ return true;
+
+ // There are two ambiguous cases where there can be no overflow:
+ // SignBits == BitWidth + 1 and
+ // SignBits == BitWidth
+ // The second case is difficult to check, therefore we only handle the
+ // first case.
+ if (SignBits == BitWidth + 1) {
+ // It overflows only when both arguments are negative and the true
+ // product is exactly the minimum negative number.
+ // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
+ // For simplicity we just check if at least one side is not negative.
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
+ return true;
+ }
+ return false;
+}
+
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyMulInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // X * -1 == 0 - X
+ if (match(Op1, m_AllOnes())) {
+ BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
+ if (I.hasNoSignedWrap())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ // Also allow combining multiply instructions on vectors.
+ {
+ Value *NewOp;
+ Constant *C1, *C2;
+ const APInt *IVal;
+ if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
+ m_Constant(C1))) &&
+ match(C1, m_APInt(IVal))) {
+ // ((X << C2)*C1) == (X * (C1 << C2))
+ Constant *Shl = ConstantExpr::getShl(C1, C2);
+ BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
+ if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
+ Shl->isNotMinSignedValue())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
+ Constant *NewCst = nullptr;
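+ // Examples of the rewrite below: mul i32 %x, 8 --> shl i32 %x, 3, and
+ // mul <2 x i32> %x, <i32 4, i32 8> --> shl <2 x i32> %x, <i32 2, i32 3>.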
+ if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2())
+ // Replace X*(2^C) with X << C, where C is either a scalar or a splat.
+ NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2());
+ else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1))
+ // Replace X*(2^C) with X << C, where C is a vector of known
+ // constant powers of 2.
+ NewCst = getLogBase2Vector(CV);
+
+ if (NewCst) {
+ unsigned Width = NewCst->getType()->getPrimitiveSizeInBits();
+ BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
+
+ if (I.hasNoUnsignedWrap())
+ Shl->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap()) {
+ const APInt *V;
+ if (match(NewCst, m_APInt(V)) && *V != Width - 1)
+ Shl->setHasNoSignedWrap();
+ }
+
+ return Shl;
+ }
+ }
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+ // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n
+ // (Y + const) * (-(2**n)) -> (-const - Y) * (2**n), for positive nonzero n
+ // The "* (2**n)" thus becomes a potential shifting opportunity.
+ {
+ const APInt &Val = CI->getValue();
+ const APInt &PosVal = Val.abs();
+ if (Val.isNegative() && PosVal.isPowerOf2()) {
+ Value *X = nullptr, *Y = nullptr;
+ if (Op0->hasOneUse()) {
+ ConstantInt *C1;
+ Value *Sub = nullptr;
+ if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
+ Sub = Builder.CreateSub(X, Y, "suba");
+ else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
+ Sub = Builder.CreateSub(Builder.CreateNeg(C1), Y, "subc");
+ if (Sub)
+ return
+ BinaryOperator::CreateMul(Sub,
+ ConstantInt::get(Y->getType(), PosVal));
+ }
+ }
+ }
+ }
+
+ // Simplify mul instructions with a constant RHS.
+ if (isa<Constant>(Op1)) {
+ if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
+ return FoldedMul;
+
+ // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
+ {
+ Value *X;
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
+ Value *Mul = Builder.CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
+ }
+ }
+ }
+
+ if (Value *Op0v = dyn_castNegVal(Op0)) { // -X * -Y = X*Y
+ if (Value *Op1v = dyn_castNegVal(Op1)) {
+ BinaryOperator *BO = BinaryOperator::CreateMul(Op0v, Op1v);
+ if (I.hasNoSignedWrap() &&
+ match(Op0, m_NSWSub(m_Value(), m_Value())) &&
+ match(Op1, m_NSWSub(m_Value(), m_Value())))
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+ }
+
+ // (X / Y) * Y = X - (X % Y)
+ // (X / Y) * -Y = (X % Y) - X
+ {
+ Value *Y = Op1;
+ BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+ if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+ Div->getOpcode() != Instruction::SDiv)) {
+ Y = Op0;
+ Div = dyn_cast<BinaryOperator>(Op1);
+ }
+ Value *Neg = dyn_castNegVal(Y);
+ if (Div && Div->hasOneUse() &&
+ (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+ (Div->getOpcode() == Instruction::UDiv ||
+ Div->getOpcode() == Instruction::SDiv)) {
+ Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
+
+ // If the division is exact, X % Y is zero, so we end up with X or -X.
+ if (Div->isExact()) {
+ if (DivOp1 == Y)
+ return replaceInstUsesWith(I, X);
+ return BinaryOperator::CreateNeg(X);
+ }
+
+ auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+ : Instruction::SRem;
+ Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
+ if (DivOp1 == Y)
+ return BinaryOperator::CreateSub(X, Rem);
+ return BinaryOperator::CreateSub(Rem, X);
+ }
+ }
+
+ // i1 mul -> i1 and.
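+ // (For i1 operands, multiplication and bitwise 'and' have the same truth table.)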
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateAnd(Op0, Op1);
+
+ // X*(1 << Y) --> X << Y
+ // (1 << Y)*X --> X << Y
+ {
+ Value *Y;
+ BinaryOperator *BO = nullptr;
+ bool ShlNSW = false;
+ if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op1, Y);
+ ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
+ } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op0, Y);
+ ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
+ }
+ if (BO) {
+ if (I.hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && ShlNSW)
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+ }
+
+ // If one of the operands of the multiply is a cast from a boolean value, then
+ // we know the bool is either zero or one, so this is a 'masking' multiply.
+ // X * Y (where Y is 0 or 1) -> X & (0-Y)
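+ // Note that (0-Y) is either 0 or all-ones, so the 'and' yields either 0 or X,
+ // exactly matching X*Y for Y in {0, 1}.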
+ if (!I.getType()->isVectorTy()) {
+ // -2 is "-1 << 1" so it is all bits set except the low one.
+ APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
+
+ Value *BoolCast = nullptr, *OtherOp = nullptr;
+ if (MaskedValueIsZero(Op0, Negative2, 0, &I)) {
+ BoolCast = Op0;
+ OtherOp = Op1;
+ } else if (MaskedValueIsZero(Op1, Negative2, 0, &I)) {
+ BoolCast = Op1;
+ OtherOp = Op0;
+ }
+
+ if (BoolCast) {
+ Value *V = Builder.CreateSub(Constant::getNullValue(I.getType()),
+ BoolCast);
+ return BinaryOperator::CreateAnd(V, OtherOp);
+ }
+ }
+
+ // Check for (mul (sext x), y), see if we can merge this into an
+ // integer mul followed by a sext.
+ if (SExtInst *Op0Conv = dyn_cast<SExtInst>(Op0)) {
+ // (mul (sext x), cst) --> (sext (mul x, cst'))
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (Op0Conv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
+ if (ConstantExpr::getSExt(CI, I.getType()) == Op1C &&
+ willNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
+ // Insert the new, smaller mul.
+ Value *NewMul =
+ Builder.CreateNSWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ return new SExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // (mul (sext x), (sext y)) --> (sext (mul int x, y))
+ if (SExtInst *Op1Conv = dyn_cast<SExtInst>(Op1)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of sexts), and if the
+ // integer mul will not overflow.
+ if (Op0Conv->getOperand(0)->getType() ==
+ Op1Conv->getOperand(0)->getType() &&
+ (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
+ willNotOverflowSignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
+ // Insert the new integer mul.
+ Value *NewMul = Builder.CreateNSWMul(
+ Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
+ return new SExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // Check for (mul (zext x), y), see if we can merge this into an
+ // integer mul followed by a zext.
+ if (auto *Op0Conv = dyn_cast<ZExtInst>(Op0)) {
+ // (mul (zext x), cst) --> (zext (mul x, cst'))
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (Op0Conv->hasOneUse()) {
+ Constant *CI =
+ ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
+ if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
+ // Insert the new, smaller mul.
+ Value *NewMul =
+ Builder.CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ return new ZExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ // (mul (zext x), (zext y)) --> (zext (mul int x, y))
+ if (auto *Op1Conv = dyn_cast<ZExtInst>(Op1)) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of zexts), and if the
+ // integer mul will not overflow.
+ if (Op0Conv->getOperand(0)->getType() ==
+ Op1Conv->getOperand(0)->getType() &&
+ (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
+ // Insert the new integer mul.
+ Value *NewMul = Builder.CreateNUWMul(
+ Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
+ return new ZExtInst(NewMul, I.getType());
+ }
+ }
+ }
+
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
+/// Detect pattern log2(Y * 0.5) with corresponding fast math flags.
+static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
+ if (!Op->hasOneUse())
+ return;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
+ if (!II)
+ return;
+ if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+ return;
+ Log2 = II;
+
+ Value *OpLog2Of = II->getArgOperand(0);
+ if (!OpLog2Of->hasOneUse())
+ return;
+
+ Instruction *I = dyn_cast<Instruction>(OpLog2Of);
+ if (!I)
+ return;
+ if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+ return;
+
+ if (match(I->getOperand(0), m_SpecificFP(0.5)))
+ Y = I->getOperand(1);
+ else if (match(I->getOperand(1), m_SpecificFP(0.5)))
+ Y = I->getOperand(0);
+}
+
+static bool isFiniteNonZeroFp(Constant *C) {
+ if (C->getType()->isVectorTy()) {
+ for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
+ ++I) {
+ ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
+ if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
+ return false;
+ }
+ return true;
+ }
+
+ return isa<ConstantFP>(C) &&
+ cast<ConstantFP>(C)->getValueAPF().isFiniteNonZero();
+}
+
+static bool isNormalFp(Constant *C) {
+ if (C->getType()->isVectorTy()) {
+ for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
+ ++I) {
+ ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
+ if (!CFP || !CFP->getValueAPF().isNormal())
+ return false;
+ }
+ return true;
+ }
+
+ return isa<ConstantFP>(C) && cast<ConstantFP>(C)->getValueAPF().isNormal();
+}
+
+/// Helper function of InstCombiner::visitFMul(). It returns
+/// true iff the given value is an FMul or FDiv with one and only one operand
+/// being a finite non-zero constant (i.e. not zero/NaN/infinity).
+static bool isFMulOrFDivWithConstant(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || (I->getOpcode() != Instruction::FMul &&
+ I->getOpcode() != Instruction::FDiv))
+ return false;
+
+ Constant *C0 = dyn_cast<Constant>(I->getOperand(0));
+ Constant *C1 = dyn_cast<Constant>(I->getOperand(1));
+
+ if (C0 && C1)
+ return false;
+
+ return (C0 && isFiniteNonZeroFp(C0)) || (C1 && isFiniteNonZeroFp(C1));
+}
+
+/// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
+/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
+/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
+/// This function simplifies "FMulOrDiv * C" and returns the resulting
+/// expression. Note that it may return null when the constants cannot be
+/// folded into a normal floating-point value.
+///
+Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C,
+ Instruction *InsertBefore) {
+ assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
+
+ Value *Opnd0 = FMulOrDiv->getOperand(0);
+ Value *Opnd1 = FMulOrDiv->getOperand(1);
+
+ Constant *C0 = dyn_cast<Constant>(Opnd0);
+ Constant *C1 = dyn_cast<Constant>(Opnd1);
+
+ BinaryOperator *R = nullptr;
+
+ // (X * C0) * C => X * (C0*C)
+ if (FMulOrDiv->getOpcode() == Instruction::FMul) {
+ Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C);
+ if (isNormalFp(F))
+ R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F);
+ } else {
+ if (C0) {
+ // (C0 / X) * C => (C0 * C) / X
+ if (FMulOrDiv->hasOneUse()) {
+ // It would otherwise introduce another div.
+ Constant *F = ConstantExpr::getFMul(C0, C);
+ if (isNormalFp(F))
+ R = BinaryOperator::CreateFDiv(F, Opnd1);
+ }
+ } else {
+ // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
+ Constant *F = ConstantExpr::getFDiv(C, C1);
+ if (isNormalFp(F)) {
+ R = BinaryOperator::CreateFMul(Opnd0, F);
+ } else {
+ // (X / C1) * C => X / (C1/C)
+ Constant *F = ConstantExpr::getFDiv(C1, C);
+ if (isNormalFp(F))
+ R = BinaryOperator::CreateFDiv(Opnd0, F);
+ }
+ }
+ }
+
+ if (R) {
+ R->setHasUnsafeAlgebra(true);
+ InsertNewInstWith(R, *InsertBefore);
+ }
+
+ return R;
+}
+
+Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (isa<Constant>(Op0))
+ std::swap(Op0, Op1);
+
+ if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ bool AllowReassociate = I.hasUnsafeAlgebra();
+
+ // Simplify mul instructions with a constant RHS.
+ if (isa<Constant>(Op1)) {
+ if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I))
+ return FoldedMul;
+
+ // (fmul X, -1.0) --> (fsub -0.0, X)
+ if (match(Op1, m_SpecificFP(-1.0))) {
+ Constant *NegZero = ConstantFP::getNegativeZero(Op1->getType());
+ Instruction *RI = BinaryOperator::CreateFSub(NegZero, Op0);
+ RI->copyFastMathFlags(&I);
+ return RI;
+ }
+
+ Constant *C = cast<Constant>(Op1);
+ if (AllowReassociate && isFiniteNonZeroFp(C)) {
+ // Let MDC denote an expression in one of these forms:
+ // X * C, C/X, X/C, where C is a constant.
+ //
+ // Try to simplify "MDC * Constant"
+ if (isFMulOrFDivWithConstant(Op0))
+ if (Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I))
+ return replaceInstUsesWith(I, V);
+
+ // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
+ Instruction *FAddSub = dyn_cast<Instruction>(Op0);
+ if (FAddSub &&
+ (FAddSub->getOpcode() == Instruction::FAdd ||
+ FAddSub->getOpcode() == Instruction::FSub)) {
+ Value *Opnd0 = FAddSub->getOperand(0);
+ Value *Opnd1 = FAddSub->getOperand(1);
+ Constant *C0 = dyn_cast<Constant>(Opnd0);
+ Constant *C1 = dyn_cast<Constant>(Opnd1);
+ bool Swap = false;
+ if (C0) {
+ std::swap(C0, C1);
+ std::swap(Opnd0, Opnd1);
+ Swap = true;
+ }
+
+ if (C1 && isFiniteNonZeroFp(C1) && isFMulOrFDivWithConstant(Opnd0)) {
+ Value *M1 = ConstantExpr::getFMul(C1, C);
+ Value *M0 = isNormalFp(cast<Constant>(M1)) ?
+ foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
+ nullptr;
+ if (M0 && M1) {
+ if (Swap && FAddSub->getOpcode() == Instruction::FSub)
+ std::swap(M0, M1);
+
+ Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
+ ? BinaryOperator::CreateFAdd(M0, M1)
+ : BinaryOperator::CreateFSub(M0, M1);
+ RI->copyFastMathFlags(&I);
+ return RI;
+ }
+ }
+ }
+ }
+ }
+
+ if (Op0 == Op1) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
+ // sqrt(X) * sqrt(X) -> X
+ if (AllowReassociate && II->getIntrinsicID() == Intrinsic::sqrt)
+ return replaceInstUsesWith(I, II->getOperand(0));
+
+ // fabs(X) * fabs(X) -> X * X
+ if (II->getIntrinsicID() == Intrinsic::fabs) {
+ Instruction *FMulVal = BinaryOperator::CreateFMul(II->getOperand(0),
+ II->getOperand(0),
+ I.getName());
+ FMulVal->copyFastMathFlags(&I);
+ return FMulVal;
+ }
+ }
+ }
+
+ // Under unsafe algebra do:
+ // X * log2(0.5*Y) = X*log2(Y) - X
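+ // (log2(0.5*Y) = log2(Y) + log2(0.5) = log2(Y) - 1, so the product expands to
+ // X*log2(Y) - X.)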
+ if (AllowReassociate) {
+ Value *OpX = nullptr;
+ Value *OpY = nullptr;
+ IntrinsicInst *Log2;
+ detectLog2OfHalf(Op0, OpY, Log2);
+ if (OpY) {
+ OpX = Op1;
+ } else {
+ detectLog2OfHalf(Op1, OpY, Log2);
+ if (OpY) {
+ OpX = Op0;
+ }
+ }
+ // If the pattern was detected, emit the alternate sequence.
+ if (OpX && OpY) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(Log2->getFastMathFlags());
+ Log2->setArgOperand(0, OpY);
+ Value *FMulVal = Builder.CreateFMul(OpX, Log2);
+ Value *FSub = Builder.CreateFSub(FMulVal, OpX);
+ FSub->takeName(&I);
+ return replaceInstUsesWith(I, FSub);
+ }
+ }
+
+ // Handle the symmetric situation in a 2-iteration loop.
+ Value *Opnd0 = Op0;
+ Value *Opnd1 = Op1;
+ for (int i = 0; i < 2; i++) {
+ bool IgnoreZeroSign = I.hasNoSignedZeros();
+ if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+
+ Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
+ Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
+
+ // -X * -Y => X*Y
+ if (N1) {
+ Value *FMul = Builder.CreateFMul(N0, N1);
+ FMul->takeName(&I);
+ return replaceInstUsesWith(I, FMul);
+ }
+
+ if (Opnd0->hasOneUse()) {
+ // -X * Y => -(X*Y) (Promote negation as high as possible)
+ Value *T = Builder.CreateFMul(N0, Opnd1);
+ Value *Neg = Builder.CreateFNeg(T);
+ Neg->takeName(&I);
+ return replaceInstUsesWith(I, Neg);
+ }
+ }
+
+ // (X*Y) * X => (X*X) * Y where Y != X
+ // The purpose is two-fold:
+ // 1) to form a power expression (of X).
+ // 2) potentially shorten the critical path: After transformation, the
+ // latency of the instruction Y is amortized by the expression of X*X,
+ // and therefore Y is in a "less critical" position compared to what it
+ // was before the transformation.
+ //
+ if (AllowReassociate) {
+ Value *Opnd0_0, *Opnd0_1;
+ if (Opnd0->hasOneUse() &&
+ match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) {
+ Value *Y = nullptr;
+ if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1)
+ Y = Opnd0_1;
+ else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1)
+ Y = Opnd0_0;
+
+ if (Y) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ Value *T = Builder.CreateFMul(Opnd1, Opnd1);
+ Value *R = Builder.CreateFMul(T, Y);
+ R->takeName(&I);
+ return replaceInstUsesWith(I, R);
+ }
+ }
+ }
+
+ if (!isa<Constant>(Op1))
+ std::swap(Opnd0, Opnd1);
+ else
+ break;
+ }
+
+ return Changed ? &I : nullptr;
+}
+
+/// Try to fold a divide or remainder whose divisor is a select instruction.
+bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
+ SelectInst *SI = cast<SelectInst>(I.getOperand(1));
+
+ // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+ int NonNullOperand = -1;
+ if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+ if (ST->isNullValue())
+ NonNullOperand = 2;
+ // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+ if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+ if (ST->isNullValue())
+ NonNullOperand = 1;
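+ // A zero arm can never be the divisor that is actually used: dividing (or
+ // taking a remainder) by zero is UB, so the select must produce the other arm.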
+
+ if (NonNullOperand == -1)
+ return false;
+
+ Value *SelectCond = SI->getOperand(0);
+
+ // Change the div/rem to use 'Y' instead of the select.
+ I.setOperand(1, SI->getOperand(NonNullOperand));
+
+ // Okay, we know we can replace the operand of the div/rem with 'Y' with no
+ // problem. However, the select, or the condition of the select may have
+ // multiple uses. Based on our knowledge that the operand must be non-zero,
+ // propagate the known value for the select into other uses of it, and
+ // propagate a known value of the condition into its other users.
+
+ // If the select and its condition each have only a single use, don't bother
+ // with this; exit early.
+ if (SI->use_empty() && SelectCond->hasOneUse())
+ return true;
+
+ // Scan the current block backward, looking for other uses of SI.
+ BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
+
+ while (BBI != BBFront) {
+ --BBI;
+ // If we found a call to a function, we can't assume it will return, so
+ // information from below it cannot be propagated above it.
+ if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+ break;
+
+ // Replace uses of the select or its condition with the known values.
+ for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+ I != E; ++I) {
+ if (*I == SI) {
+ *I = SI->getOperand(NonNullOperand);
+ Worklist.Add(&*BBI);
+ } else if (*I == SelectCond) {
+ *I = Builder.getInt1(NonNullOperand == 1);
+ Worklist.Add(&*BBI);
+ }
+ }
+
+ // If we passed the instruction, quit looking for it.
+ if (&*BBI == SI)
+ SI = nullptr;
+ if (&*BBI == SelectCond)
+ SelectCond = nullptr;
+
+ // If we ran out of things to eliminate, break out of the loop.
+ if (!SelectCond && !SI)
+ break;
+
+ }
+ return true;
+}
+
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
+ I.setOperand(1, V);
+ return &I;
+ }
+
+ // Handle cases involving: [su]div X, (select Cond, Y, Z)
+ // This does not apply for fdiv.
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+ return &I;
+
+ if (Instruction *LHS = dyn_cast<Instruction>(Op0)) {
+ const APInt *C2;
+ if (match(Op1, m_APInt(C2))) {
+ Value *X;
+ const APInt *C1;
+ bool IsSigned = I.getOpcode() == Instruction::SDiv;
+
+ // (X / C1) / C2 -> X / (C1*C2)
+ if ((IsSigned && match(LHS, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(LHS, m_UDiv(m_Value(X), m_APInt(C1))))) {
+ APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ if (!MultiplyOverflows(*C1, *C2, Product, IsSigned))
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(I.getType(), Product));
+ }
+
+ if ((IsSigned && match(LHS, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(LHS, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+
+ // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+ if (IsMultiple(*C2, *C1, Quotient, IsSigned)) {
+ BinaryOperator *BO = BinaryOperator::Create(
+ I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
+ // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+ if (IsMultiple(*C1, *C2, Quotient, IsSigned)) {
+ BinaryOperator *BO = BinaryOperator::Create(
+ Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
+ BO->setHasNoUnsignedWrap(
+ !IsSigned &&
+ cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
+ BO->setHasNoSignedWrap(
+ cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
+ return BO;
+ }
+ }
+
+ if ((IsSigned && match(LHS, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+ *C1 != C1->getBitWidth() - 1) ||
+ (!IsSigned && match(LHS, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+ APInt C1Shifted = APInt::getOneBitSet(
+ C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+ // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of C1.
+ if (IsMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+ BinaryOperator *BO = BinaryOperator::Create(
+ I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
+ // (X << C1) / C2 -> X * (C2 >> C1) if C1 is a multiple of C2.
+ if (IsMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+ BinaryOperator *BO = BinaryOperator::Create(
+ Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
+ BO->setHasNoUnsignedWrap(
+ !IsSigned &&
+ cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
+ BO->setHasNoSignedWrap(
+ cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
+ return BO;
+ }
+ }
+
+ if (!C2->isNullValue()) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I))
+ return FoldedDiv;
+ }
+ }
+
+ if (match(Op0, m_One())) {
+ assert(!I.getType()->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (I.getOpcode() == Instruction::SDiv) {
+ // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
+ // result is one; if Op1 is -1 then the result is minus one; otherwise
+ // it's zero.
+ Value *Inc = Builder.CreateAdd(Op1, Op0);
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(I.getType(), 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0));
+ } else {
+ // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
+ // result is one, otherwise it's zero.
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), I.getType());
+ }
+ }
+
+ // See if we can fold away this div instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
+ Value *X = nullptr, *Z = nullptr;
+ if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1
+ bool isSigned = I.getOpcode() == Instruction::SDiv;
+ if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+ (!isSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+ return BinaryOperator::Create(I.getOpcode(), X, Op1);
+ }
+
+ return nullptr;
+}
+
+/// dyn_castZExtVal - Checks if V is a zext or constant that can
+/// be truncated to Ty without losing bits.
+static Value *dyn_castZExtVal(Value *V, Type *Ty) {
+ if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) {
+ if (Z->getSrcTy() == Ty)
+ return Z->getOperand(0);
+ } else if (ConstantInt *C = dyn_cast<ConstantInt>(V)) {
+ if (C->getValue().getActiveBits() <= cast<IntegerType>(Ty)->getBitWidth())
+ return ConstantExpr::getTrunc(C, Ty);
+ }
+ return nullptr;
+}
+
+namespace {
+const unsigned MaxDepth = 6;
+typedef Instruction *(*FoldUDivOperandCb)(Value *Op0, Value *Op1,
+ const BinaryOperator &I,
+ InstCombiner &IC);
+
+/// \brief Used to maintain state for visitUDivOperand().
+struct UDivFoldAction {
+ FoldUDivOperandCb FoldAction; ///< Informs visitUDiv() how to fold this
+ ///< operand. This can be zero if this action
+ ///< joins two actions together.
+
+ Value *OperandToFold; ///< Which operand to fold.
+ union {
+ Instruction *FoldResult; ///< The instruction returned when FoldAction is
+ ///< invoked.
+
+ size_t SelectLHSIdx; ///< Stores the LHS action index if this action
+ ///< joins two actions together.
+ };
+
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
+ : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
+ : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
+};
+} // namespace
+
+// X udiv 2^C -> X >> C
+static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
+ const BinaryOperator &I, InstCombiner &IC) {
+ const APInt &C = cast<Constant>(Op1)->getUniqueInteger();
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(
+ Op0, ConstantInt::get(Op0->getType(), C.logBase2()));
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
+// X udiv C, where C >= signbit
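+// When C has its sign bit set, the quotient can only be 0 or 1, so a compare
+// plus select suffices.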
+static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
+ const BinaryOperator &I, InstCombiner &IC) {
+ Value *ICI = IC.Builder.CreateICmpULT(Op0, cast<ConstantInt>(Op1));
+
+ return SelectInst::Create(ICI, Constant::getNullValue(I.getType()),
+ ConstantInt::get(I.getType(), 1));
+}
+
+// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
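+// Rationale: C1 << N == 2^(C2+N), and unsigned division by a power of two is a
+// logical right shift.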
+static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
+ InstCombiner &IC) {
+ Value *ShiftLeft;
+ if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
+ ShiftLeft = Op1;
+
+ const APInt *CI;
+ Value *N;
+ if (!match(ShiftLeft, m_Shl(m_APInt(CI), m_Value(N))))
+ llvm_unreachable("match should never fail here!");
+ if (*CI != 1)
+ N = IC.Builder.CreateAdd(N, ConstantInt::get(N->getType(), CI->logBase2()));
+ if (Op1 != ShiftLeft)
+ N = IC.Builder.CreateZExt(N, Op1->getType());
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
+/// \brief Recursively visits the possible right-hand operands of a udiv
+/// instruction, seeing through select instructions, to determine if we can
+/// replace the udiv with something simpler. If we find that an operand is not
+/// able to simplify the udiv, we abort the entire transformation.
+static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
+ SmallVectorImpl<UDivFoldAction> &Actions,
+ unsigned Depth = 0) {
+ // Check to see if this is an unsigned division with an exact power of 2,
+ // if so, convert to a right shift.
+ if (match(Op1, m_Power2())) {
+ Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
+ return Actions.size();
+ }
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Op1))
+ // X udiv C, where C >= signbit
+ if (C->getValue().isNegative()) {
+ Actions.push_back(UDivFoldAction(foldUDivNegCst, C));
+ return Actions.size();
+ }
+
+ // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+ if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
+ match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
+ Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
+ return Actions.size();
+ }
+
+ // The remaining tests are all recursive, so bail out if we hit the limit.
+ if (Depth++ == MaxDepth)
+ return 0;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (size_t LHSIdx =
+ visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
+ if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
+ Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
+ return Actions.size();
+ }
+
+ return 0;
+}
+
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyUDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
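+ // For unsigned x, floor(floor(x / 2^C1) / C2) == floor(x / (C2 * 2^C1)), so
+ // the fold is valid whenever C2 << C1 does not overflow.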
+ {
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) &&
+ match(Op1, m_APInt(C2))) {
+ bool Overflow;
+ APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+ if (!Overflow) {
+ bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+ BinaryOperator *BO = BinaryOperator::CreateUDiv(
+ X, ConstantInt::get(X->getType(), C2ShlC1));
+ if (IsExact)
+ BO->setIsExact();
+ return BO;
+ }
+ }
+ }
+
+ // (zext A) udiv (zext B) --> zext (A udiv B)
+ if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+ if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+ return new ZExtInst(
+ Builder.CreateUDiv(ZOp0->getOperand(0), ZOp1, "div", I.isExact()),
+ I.getType());
+
+ // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
+ SmallVector<UDivFoldAction, 6> UDivActions;
+ if (visitUDivOperand(Op0, Op1, I, UDivActions))
+ for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
+ FoldUDivOperandCb Action = UDivActions[i].FoldAction;
+ Value *ActionOp1 = UDivActions[i].OperandToFold;
+ Instruction *Inst;
+ if (Action)
+ Inst = Action(Op0, ActionOp1, I, *this);
+ else {
+ // This action joins two actions together. The RHS of this action is
+ // simply the last action we processed; we saved the LHS action index in
+ // the joining action.
+ size_t SelectRHSIdx = i - 1;
+ Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
+ size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
+ Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
+ Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
+ SelectLHS, SelectRHS);
+ }
+
+ // If this is the last action to process, return it to the InstCombiner.
+ // Otherwise, we insert it before the UDiv and record it so that we may
+ // use it as part of a joining action (i.e., a SelectInst).
+ if (e - i != 1) {
+ Inst->insertBefore(&I);
+ UDivActions[i].FoldResult = Inst;
+ } else
+ return Inst;
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifySDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ const APInt *Op1C;
+ if (match(Op1, m_APInt(Op1C))) {
+ // sdiv X, -1 == -X
+ if (Op1C->isAllOnesValue())
+ return BinaryOperator::CreateNeg(Op0);
+
+ // sdiv exact X, C --> ashr exact X, log2(C)
+ if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) {
+ Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2());
+ return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName());
+ }
+
+ // If the dividend is sign-extended and the constant divisor is small enough
+ // to fit in the source type, shrink the division to the narrower type:
+ // (sext X) sdiv C --> sext (X sdiv C)
+ Value *Op0Src;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
+ Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
+
+ // In the general case, we need to make sure that the dividend is not the
+ // minimum signed value because dividing that by -1 is UB. But here, we
+ // know that the -1 divisor case is already handled above.
+
+ Constant *NarrowDivisor =
+ ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
+ Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
+ return new SExtInst(NarrowOp, Op0->getType());
+ }
+ }
+
+ if (Constant *RHS = dyn_cast<Constant>(Op1)) {
+ // X/INT_MIN -> X == INT_MIN
+ if (RHS->isMinSignedValue())
+ return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), I.getType());
+
+ // -X/C --> X/-C provided the negation doesn't overflow.
+ Value *X;
+ if (match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
+ auto *BO = BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(RHS));
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a udiv.
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
+ // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
+ // Safe because the only negative value (1 << Y) can take on is
+ // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
+ // the sign bit set.
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ return nullptr;
+}
+
+/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C is not a special
+/// FP value and:
+/// 1) 1/C is exact, or
+/// 2) reciprocal is allowed.
+/// If the conversion was successful, the simplified expression "X * 1/C" is
+/// returned; otherwise, NULL is returned.
+///
+static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Constant *Divisor,
+ bool AllowReciprocal) {
+ if (!isa<ConstantFP>(Divisor)) // TODO: handle vectors.
+ return nullptr;
+
+ const APFloat &FpVal = cast<ConstantFP>(Divisor)->getValueAPF();
+ APFloat Reciprocal(FpVal.getSemantics());
+ bool Cvt = FpVal.getExactInverse(&Reciprocal);
+
+ if (!Cvt && AllowReciprocal && FpVal.isFiniteNonZero()) {
+ Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
+ (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
+ Cvt = !Reciprocal.isDenormal();
+ }
+
+ if (!Cvt)
+ return nullptr;
+
+ ConstantFP *R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal);
+ return BinaryOperator::CreateFMul(Dividend, R);
+}
+
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ bool AllowReassociate = I.hasUnsafeAlgebra();
+ bool AllowReciprocal = I.hasAllowReciprocal();
+
+ if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (AllowReassociate) {
+ Constant *C1 = nullptr;
+ Constant *C2 = Op1C;
+ Value *X;
+ Instruction *Res = nullptr;
+
+ if (match(Op0, m_FMul(m_Value(X), m_Constant(C1)))) {
+ // (X*C1)/C2 => X * (C1/C2)
+ //
+ Constant *C = ConstantExpr::getFDiv(C1, C2);
+ if (isNormalFp(C))
+ Res = BinaryOperator::CreateFMul(X, C);
+ } else if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
+ //
+ Constant *C = ConstantExpr::getFMul(C1, C2);
+ if (isNormalFp(C)) {
+ Res = CvtFDivConstToReciprocal(X, C, AllowReciprocal);
+ if (!Res)
+ Res = BinaryOperator::CreateFDiv(X, C);
+ }
+ }
+
+ if (Res) {
+ Res->setFastMathFlags(I.getFastMathFlags());
+ return Res;
+ }
+ }
+
+ // X / C => X * 1/C
+ if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) {
+ T->copyFastMathFlags(&I);
+ return T;
+ }
+
+ return nullptr;
+ }
+
+ if (AllowReassociate && isa<Constant>(Op0)) {
+ Constant *C1 = cast<Constant>(Op0), *C2;
+ Constant *Fold = nullptr;
+ Value *X;
+ bool CreateDiv = true;
+
+ // C1 / (X*C2) => (C1/C2) / X
+ if (match(Op1, m_FMul(m_Value(X), m_Constant(C2))))
+ Fold = ConstantExpr::getFDiv(C1, C2);
+ else if (match(Op1, m_FDiv(m_Value(X), m_Constant(C2)))) {
+ // C1 / (X/C2) => (C1*C2) / X
+ Fold = ConstantExpr::getFMul(C1, C2);
+ } else if (match(Op1, m_FDiv(m_Constant(C2), m_Value(X)))) {
+ // C1 / (C2/X) => (C1/C2) * X
+ Fold = ConstantExpr::getFDiv(C1, C2);
+ CreateDiv = false;
+ }
+
+ if (Fold && isNormalFp(Fold)) {
+ Instruction *R = CreateDiv ? BinaryOperator::CreateFDiv(Fold, X)
+ : BinaryOperator::CreateFMul(X, Fold);
+ R->setFastMathFlags(I.getFastMathFlags());
+ return R;
+ }
+ return nullptr;
+ }
+
+ if (AllowReassociate) {
+ Value *X, *Y;
+ Value *NewInst = nullptr;
+ Instruction *SimpR = nullptr;
+
+ if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) {
+ // (X/Y) / Z => X / (Y*Z)
+ //
+ if (!isa<Constant>(Y) || !isa<Constant>(Op1)) {
+ NewInst = Builder.CreateFMul(Y, Op1);
+ if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
+ FastMathFlags Flags = I.getFastMathFlags();
+ Flags &= cast<Instruction>(Op0)->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ }
+ SimpR = BinaryOperator::CreateFDiv(X, NewInst);
+ }
+ } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) {
+ // Z / (X/Y) => Z*Y / X
+ //
+ if (!isa<Constant>(Y) || !isa<Constant>(Op0)) {
+ NewInst = Builder.CreateFMul(Op0, Y);
+ if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
+ FastMathFlags Flags = I.getFastMathFlags();
+ Flags &= cast<Instruction>(Op1)->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ }
+ SimpR = BinaryOperator::CreateFDiv(NewInst, X);
+ }
+ }
+
+ if (NewInst) {
+ if (Instruction *T = dyn_cast<Instruction>(NewInst))
+ T->setDebugLoc(I.getDebugLoc());
+ SimpR->setFastMathFlags(I.getFastMathFlags());
+ return SimpR;
+ }
+ }
+
+ Value *LHS;
+ Value *RHS;
+
+ // -x / -y -> x / y
+ if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) {
+ I.setOperand(0, LHS);
+ I.setOperand(1, RHS);
+ return &I;
+ }
+
+ return nullptr;
+}
+
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// @brief Common integer remainder transforms
+Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
+ I.setOperand(1, V);
+ return &I;
+ }
+
+ // Handle cases involving: rem X, (select Cond, Y, Z)
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+ return &I;
+
+ if (isa<Constant>(Op1)) {
+ if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+ } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
+ using namespace llvm::PatternMatch;
+ const APInt *Op1Int;
+ if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
+ (I.getOpcode() == Instruction::URem ||
+ !Op1Int->isMinSignedValue())) {
+ // foldOpIntoPhi will speculate instructions to the end of the PHI's
+ // predecessor blocks, so do this only if we know the srem or urem
+ // will not fault.
+ if (Instruction *NV = foldOpIntoPhi(I, PN))
+ return NV;
+ }
+ }
+
+ // See if we can fold away this rem instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ }
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitURem(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyURemInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *common = commonIRemTransforms(I))
+ return common;
+
+ // (zext A) urem (zext B) --> zext (A urem B)
+ if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+ if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+ return new ZExtInst(Builder.CreateURem(ZOp0->getOperand(0), ZOp1),
+ I.getType());
+
+ // X urem Y -> X and Y-1, where Y is a power of 2.
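+ // e.g. urem i32 %x, 8 --> and i32 %x, 7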
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ Constant *N1 = Constant::getAllOnesValue(I.getType());
+ Value *Add = Builder.CreateAdd(Op1, N1);
+ return BinaryOperator::CreateAnd(Op0, Add);
+ }
+
+ // 1 urem X -> zext(X != 1)
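+ // (1 urem X is 0 when X == 1 and 1 for any X > 1; X == 0 would be UB.)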
+ if (match(Op0, m_One())) {
+ Value *Cmp = Builder.CreateICmpNE(Op1, Op0);
+ Value *Ext = Builder.CreateZExt(Cmp, I.getType());
+ return replaceInstUsesWith(I, Ext);
+ }
+
+ // X urem C -> X < C ? X : X - C, where C >= signbit.
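+ // With the sign bit set in C, X udiv C is at most 1, so the remainder is
+ // either X itself or X - C.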
+ const APInt *DivisorC;
+ if (match(Op1, m_APInt(DivisorC)) && DivisorC->isNegative()) {
+ Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
+ Value *Sub = Builder.CreateSub(Op0, Op1);
+ return SelectInst::Create(Cmp, Op0, Sub);
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifySRemInst(Op0, Op1, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Handle the integer rem common cases
+ if (Instruction *Common = commonIRemTransforms(I))
+ return Common;
+
+ {
+ const APInt *Y;
+ // X % -Y -> X % Y
+ if (match(Op1, m_APInt(Y)) && Y->isNegative() && !Y->isMinSignedValue()) {
+ Worklist.AddValue(I.getOperand(1));
+ I.setOperand(1, ConstantInt::get(I.getType(), -*Y));
+ return &I;
+ }
+ }
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a urem.
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
+ MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+ }
+
+ // If it's a constant vector, flip any negative values positive.
+ if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
+ Constant *C = cast<Constant>(Op1);
+ unsigned VWidth = C->getType()->getVectorNumElements();
+
+ bool hasNegative = false;
+ bool hasMissing = false;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) {
+ hasMissing = true;
+ break;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
+ if (RHS->isNegative())
+ hasNegative = true;
+ }
+
+ if (hasNegative && !hasMissing) {
+ SmallVector<Constant *, 16> Elts(VWidth);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
+ if (RHS->isNegative())
+ Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+ }
+ }
+
+ Constant *NewRHSV = ConstantVector::get(Elts);
+ if (NewRHSV != C) { // Don't loop: negating INT_MIN yields INT_MIN again.
+ Worklist.AddValue(I.getOperand(1));
+ I.setOperand(1, NewRHSV);
+ return &I;
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Handle cases involving: rem X, (select Cond, Y, Z)
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+ return &I;
+
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
new file mode 100644
index 000000000000..0011412c2bf4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -0,0 +1,1018 @@
+//===- InstCombinePHI.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitPHINode function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// The PHI arguments will be folded into a single operation with a PHI node
+/// as input. The debug location of the single operation will be the merged
+/// locations of the original PHI node arguments.
+DebugLoc InstCombiner::PHIArgMergedDebugLoc(PHINode &PN) {
+ auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ const DILocation *Loc = FirstInst->getDebugLoc();
+
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ auto *I = cast<Instruction>(PN.getIncomingValue(i));
+ Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
+ }
+
+ return Loc;
+}
+
+/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
+/// adds all have a single use, turn this into a phi and a single binop.
+Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+ unsigned Opc = FirstInst->getOpcode();
+ Value *LHSVal = FirstInst->getOperand(0);
+ Value *RHSVal = FirstInst->getOperand(1);
+
+ Type *LHSType = LHSVal->getType();
+ Type *RHSType = RHSVal->getType();
+
+ // Scan to see if all operands are the same opcode, and all have one use.
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
+ // Verify type of the LHS matches so we don't fold cmp's of different
+ // types.
+ I->getOperand(0)->getType() != LHSType ||
+ I->getOperand(1)->getType() != RHSType)
+ return nullptr;
+
+ // If they are CmpInst instructions, check their predicates
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
+ return nullptr;
+
+ // Keep track of which operand needs a phi node.
+ if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
+ if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
+ }
+
+ // If both LHS and RHS would need a PHI, don't do this transformation,
+ // because it would increase the number of PHIs entering the block,
+ // which leads to higher register pressure. This is especially
+ // bad when the PHIs are in the header of a loop.
+ if (!LHSVal && !RHSVal)
+ return nullptr;
+
+ // Otherwise, this is safe to transform!
+
+ Value *InLHS = FirstInst->getOperand(0);
+ Value *InRHS = FirstInst->getOperand(1);
+ PHINode *NewLHS = nullptr, *NewRHS = nullptr;
+ if (!LHSVal) {
+ NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(0)->getName() + ".pn");
+ NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewLHS, PN);
+ LHSVal = NewLHS;
+ }
+
+ if (!RHSVal) {
+ NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(1)->getName() + ".pn");
+ NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewRHS, PN);
+ RHSVal = NewRHS;
+ }
+
+ // Add all operands to the new PHIs.
+ if (NewLHS || NewRHS) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+ if (NewLHS) {
+ Value *NewInLHS = InInst->getOperand(0);
+ NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+ }
+ if (NewRHS) {
+ Value *NewInRHS = InInst->getOperand(1);
+ NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ }
+ }
+ }
+
+ if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ LHSVal, RHSVal);
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewCI;
+ }
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
+ BinaryOperator *NewBinOp =
+ BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+
+ NewBinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ NewBinOp->andIRFlags(PN.getIncomingValue(i));
+
+ NewBinOp->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewBinOp;
+}
+
+Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
+ GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+
+ SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
+ FirstInst->op_end());
+ // This is true if all GEP bases are allocas and if all indices into them are
+ // constants.
+ bool AllBasePointersAreAllocas = true;
+
+ // We don't want to replace this phi if the replacement would require
+ // more than one phi, which leads to higher register pressure. This is
+ // especially bad when the PHIs are in the header of a loop.
+ bool NeededPhi = false;
+
+ bool AllInBounds = true;
+
+ // Scan to see if all operands are the same opcode, and all have one use.
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() ||
+ GEP->getNumOperands() != FirstInst->getNumOperands())
+ return nullptr;
+
+ AllInBounds &= GEP->isInBounds();
+
+ // Keep track of whether or not all GEPs are of alloca pointers.
+ if (AllBasePointersAreAllocas &&
+ (!isa<AllocaInst>(GEP->getOperand(0)) ||
+ !GEP->hasAllConstantIndices()))
+ AllBasePointersAreAllocas = false;
+
+ // Compare the operand lists.
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+ if (FirstInst->getOperand(op) == GEP->getOperand(op))
+ continue;
+
+ // Don't merge two GEPs when two operands differ (introducing phi nodes)
+ // if one of the PHIs has a constant for the index. The index may be
+ // substantially cheaper to compute for the constants, so making it a
+ // variable index could pessimize the path. This also handles the case
+ // for struct indices, which must always be constant.
+ if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+ isa<ConstantInt>(GEP->getOperand(op)))
+ return nullptr;
+
+ if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+ return nullptr;
+
+ // If we already needed a PHI for an earlier operand, and another operand
+ // also requires a PHI, we'd be introducing more PHIs than we're
+ // eliminating, which increases register pressure on entry to the PHI's
+ // block.
+ if (NeededPhi)
+ return nullptr;
+
+ FixedOperands[op] = nullptr; // Needs a PHI.
+ NeededPhi = true;
+ }
+ }
+
+ // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+ // bother doing this transformation. At best, this will just save a bit of
+ // offset calculation, but all the predecessors will have to materialize the
+ // stack address into a register anyway. We'd actually rather *clone* the
+ // load up into the predecessors so that we have a load of a gep of an alloca,
+ // which can usually all be folded into the load.
+ if (AllBasePointersAreAllocas)
+ return nullptr;
+
+ // Otherwise, this is safe to transform. Insert PHI nodes for each operand
+ // that is variable.
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
+ }
+
+ Value *Base = FixedOperands[0];
+ GetElementPtrInst *NewGEP =
+ GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
+ makeArrayRef(FixedOperands).slice(1));
+ if (AllInBounds) NewGEP->setIsInBounds();
+ NewGEP->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewGEP;
+}
+
+
+/// Return true if we know that it is safe to sink the load out of the block
+/// that defines it. This means that it must be obvious the value of the load is
+/// not changed from the point of the load to the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca. Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+ BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
+
+ for (++BBI; BBI != E; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ // Check for a non-address-taken alloca. If the alloca is not already
+ // address-taken, it isn't profitable to do this transform.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+ bool isAddressTaken = false;
+ for (User *U : AI->users()) {
+ if (isa<LoadInst>(U)) continue;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If storing TO the alloca, then the address isn't taken.
+ if (SI->getOperand(1) == AI) continue;
+ }
+ isAddressTaken = true;
+ break;
+ }
+
+ if (!isAddressTaken && AI->isStaticAlloca())
+ return false;
+ }
+
+ // If this load is a load from a GEP with a constant offset from an alloca,
+ // then we don't want to sink it. In its present form, it will be
+ // load [constant stack offset]. Sinking it will cause us to have to
+ // materialize the stack addresses in each predecessor in a register only to
+ // do a shared load from register in the successor.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+ if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+ return false;
+
+ return true;
+}
+
+Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
+ LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
+
+ // FIXME: This is overconservative; this transform is allowed in some cases
+ // for atomic operations.
+ if (FirstLI->isAtomic())
+ return nullptr;
+
+ // When processing loads, we need to propagate two bits of information to the
+ // sunk load: whether it is volatile, and what its alignment is. We currently
+ // don't sink loads when some have their alignment specified and some don't.
+ // visitLoadInst will propagate an alignment onto the load when TD is around,
+ // and if TD isn't around, we can't handle the mixed case.
+ bool isVolatile = FirstLI->isVolatile();
+ unsigned LoadAlignment = FirstLI->getAlignment();
+ unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
+
+ // We can't sink the load if the loaded value could be modified between the
+ // load and the PHI.
+ if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
+ !isSafeAndProfitableToSinkLoad(FirstLI))
+ return nullptr;
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+ if (!LI || !LI->hasOneUse())
+ return nullptr;
+
+ // We can't sink the load if the loaded value could be modified between
+ // the load and the PHI.
+ if (LI->isVolatile() != isVolatile ||
+ LI->getParent() != PN.getIncomingBlock(i) ||
+ LI->getPointerAddressSpace() != LoadAddrSpace ||
+ !isSafeAndProfitableToSinkLoad(LI))
+ return nullptr;
+
+ // If some of the loads have an alignment specified but not all of them,
+ // we can't do the transformation.
+ if ((LoadAlignment != 0) != (LI->getAlignment() != 0))
+ return nullptr;
+
+ LoadAlignment = std::min(LoadAlignment, LI->getAlignment());
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstLI->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+ LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment);
+
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ };
+
+ for (unsigned ID : KnownIDs)
+ NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
+
+ // Add all operands to the new PHI and combine TBAA metadata.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i));
+ combineMetadata(NewLI, LI, KnownIDs);
+ Value *NewInVal = LI->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ NewLI->setOperand(0, InVal);
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ }
+
+ // If this was a volatile load that we are merging, make sure to loop through
+ // and mark all the input loads as non-volatile. If we don't do this, we will
+ // insert a new volatile load and the old ones will not be deletable.
+ if (isVolatile)
+ for (Value *IncValue : PN.incoming_values())
+ cast<LoadInst>(IncValue)->setVolatile(false);
+
+ NewLI->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewLI;
+}
+
+/// TODO: This function could handle other cast types, but then it might
+/// require special-casing a cast from the 'i1' type. See the comment in
+/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
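+///
+/// Illustrative sketch of the fold this performs (made-up values, i8 -> i32):
+/// %z1 = zext i8 %a to i32
+/// %z2 = zext i8 %b to i32
+/// %p = phi i32 [ %z1, %bb1 ], [ %z2, %bb2 ], [ 7, %bb3 ]
+/// becomes
+/// %p.shrunk = phi i8 [ %a, %bb1 ], [ %b, %bb2 ], [ 7, %bb3 ]
+/// %p = zext i8 %p.shrunk to i32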
+Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (TerminatorInst *TI = Phi.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ // Early exit for the common case of a phi with two operands. These are
+ // handled elsewhere. See the comment below where we check the count of zexts
+ // and constants for more details.
+ unsigned NumIncomingValues = Phi.getNumIncomingValues();
+ if (NumIncomingValues < 3)
+ return nullptr;
+
+ // Find the narrower type specified by the first zext.
+ Type *NarrowType = nullptr;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ NarrowType = Zext->getSrcTy();
+ break;
+ }
+ }
+ if (!NarrowType)
+ return nullptr;
+
+ // Walk the phi operands checking that we only have zexts or constants that
+ // we can shrink for free. Store the new operands for the new phi.
+ SmallVector<Value *, 4> NewIncoming;
+ unsigned NumZexts = 0;
+ unsigned NumConsts = 0;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ // All zexts must be identical and have one use.
+ if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse())
+ return nullptr;
+ NewIncoming.push_back(Zext->getOperand(0));
+ NumZexts++;
+ } else if (auto *C = dyn_cast<Constant>(V)) {
+ // Make sure that constants can fit in the new type.
+ Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType);
+ if (ConstantExpr::getZExt(Trunc, C->getType()) != C)
+ return nullptr;
+ NewIncoming.push_back(Trunc);
+ NumConsts++;
+ } else {
+ // If it's not a cast or a constant, bail out.
+ return nullptr;
+ }
+ }
+
+ // The more common cases of a phi with no constant operands or just one
+ // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
+ // respectively. foldOpIntoPhi() wants to do the opposite transform that is
+ // performed here. It tries to replicate a cast in the phi operand's basic
+ // block to expose other folding opportunities. Thus, InstCombine will
+ // infinite loop without this check.
+ if (NumConsts == 0 || NumZexts < 2)
+ return nullptr;
+
+ // All incoming values are zexts or constants that are safe to truncate.
+ // Create a new phi node of the narrow type, phi together all of the new
+ // operands, and zext the result back to the original type.
+ PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues,
+ Phi.getName() + ".shrunk");
+ for (unsigned i = 0; i != NumIncomingValues; ++i)
+ NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i));
+
+ InsertNewInstBefore(NewPhi, Phi);
+ return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+}
+
+/// If all operands to a PHI node are the same "unary" operator and they all are
+/// only used by the PHI, PHI together their inputs, and do the operation once,
+/// to the result of the PHI.
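+///
+/// Illustrative sketch (made-up names, not from a test case):
+/// %a = add i32 %x, 42
+/// %b = add i32 %y, 42
+/// %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+/// becomes
+/// %p.in = phi i32 [ %x, %bb1 ], [ %y, %bb2 ]
+/// %p = add i32 %p.in, 42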
+Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (TerminatorInst *TI = PN.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+ if (isa<GetElementPtrInst>(FirstInst))
+ return FoldPHIArgGEPIntoPHI(PN);
+ if (isa<LoadInst>(FirstInst))
+ return FoldPHIArgLoadIntoPHI(PN);
+
+ // Scan the instruction, looking for input operations that can be folded away.
+ // If all input operands to the phi are the same instruction (e.g. a cast from
+ // the same type or "+42") we can pull the operation through the PHI, reducing
+ // code size and simplifying code.
+ Constant *ConstantOp = nullptr;
+ Type *CastSrcTy = nullptr;
+
+ if (isa<CastInst>(FirstInst)) {
+ CastSrcTy = FirstInst->getOperand(0)->getType();
+
+ // Be careful about transforming integer PHIs. We don't want to pessimize
+ // the code by turning an i32 into an i1293.
+ if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
+ if (!shouldChangeType(PN.getType(), CastSrcTy))
+ return nullptr;
+ }
+ } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+ // Can fold binop, compare or shift here if the RHS is a constant,
+ // otherwise call FoldPHIArgBinOpIntoPHI.
+ ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+ if (!ConstantOp)
+ return FoldPHIArgBinOpIntoPHI(PN);
+ } else {
+ return nullptr; // Cannot fold this operation.
+ }
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ if (!I || !I->hasOneUse() || !I->isSameOperationAs(FirstInst))
+ return nullptr;
+ if (CastSrcTy) {
+ if (I->getOperand(0)->getType() != CastSrcTy)
+ return nullptr; // Cast operation must match.
+ } else if (I->getOperand(1) != ConstantOp) {
+ return nullptr;
+ }
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstInst->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+ // Add all operands to the new PHI.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ Value *PhiVal;
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ PhiVal = InVal;
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ PhiVal = NewPN;
+ }
+
+ // Insert and return the new operation.
+ if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+ CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+ PN.getType());
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewCI;
+ }
+
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
+ BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+ BinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ BinOp->andIRFlags(PN.getIncomingValue(i));
+
+ BinOp->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return BinOp;
+ }
+
+ CmpInst *CIOp = cast<CmpInst>(FirstInst);
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ PhiVal, ConstantOp);
+ NewCI->setDebugLoc(PHIArgMergedDebugLoc(PN));
+ return NewCI;
+}
+
+/// Return true if this PHI node is only used by a PHI node cycle that is dead.
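+///
+/// For example (an illustrative sketch), %a and %b below form such a cycle if
+/// nothing else uses them:
+/// %a = phi i32 [ %b, %bb1 ], [ 0, %entry1 ]
+/// %b = phi i32 [ %a, %bb2 ], [ 1, %entry2 ]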
+static bool DeadPHICycle(PHINode *PN,
+ SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
+ if (PN->use_empty()) return true;
+ if (!PN->hasOneUse()) return false;
+
+ // Remember this node, and if we find the cycle, return.
+ if (!PotentiallyDeadPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PotentiallyDeadPHIs.size() == 16)
+ return false;
+
+ if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
+ return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+ return false;
+}
+
+/// Return true if this phi node is always equal to NonPhiInVal.
+/// This happens with mutually cyclic phi nodes like:
+/// z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
+ SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
+ // See if we already saw this PHI node.
+ if (!ValueEqualPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (ValueEqualPHIs.size() == 16)
+ return false;
+
+ // Scan the operands to see if they are either phi nodes or are equal to
+ // the value.
+ for (Value *Op : PN->incoming_values()) {
+ if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+ if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+ return false;
+ } else if (Op != NonPhiInVal)
+ return false;
+ }
+
+ return true;
+}
+
+/// Return an existing non-zero constant if this phi node has one, otherwise
+/// return constant 1.
+static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
+ assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
+ for (Value *V : PN.operands())
+ if (auto *ConstVA = dyn_cast<ConstantInt>(V))
+ if (!ConstVA->isZero())
+ return ConstVA;
+ return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
+}
+
+namespace {
+struct PHIUsageRecord {
+ unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
+ unsigned Shift; // The amount shifted.
+ Instruction *Inst; // The trunc instruction.
+
+ PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+ : PHIId(pn), Shift(Sh), Inst(User) {}
+
+ bool operator<(const PHIUsageRecord &RHS) const {
+ if (PHIId < RHS.PHIId) return true;
+ if (PHIId > RHS.PHIId) return false;
+ if (Shift < RHS.Shift) return true;
+ if (Shift > RHS.Shift) return false;
+ return Inst->getType()->getPrimitiveSizeInBits() <
+ RHS.Inst->getType()->getPrimitiveSizeInBits();
+ }
+};
+
+struct LoweredPHIRecord {
+ PHINode *PN; // The PHI that was lowered.
+ unsigned Shift; // The amount shifted.
+ unsigned Width; // The width extracted.
+
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
+ : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+
+ // Ctor form used by DenseMap.
+ LoweredPHIRecord(PHINode *pn, unsigned Sh)
+ : PN(pn), Shift(Sh), Width(0) {}
+};
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(nullptr, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(nullptr, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+ (Val.Width>>3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+ LHS.Width == RHS.Width;
+ }
+ };
+}
+
+
+/// This is an integer PHI and we know that it has an illegal type: see if it is
+/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
+/// the various pieces being extracted. This sort of thing is introduced when
+/// SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be a bitcast to float/double/vector or an
+/// inttoptr. We should produce new PHIs in the right type.
+///
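+/// Illustrative sketch (hand-written, not from the original comments): for an
+/// illegal i64 PHI %p whose only uses are
+/// %lo = trunc i64 %p to i32
+/// %hi.s = lshr i64 %p, 32
+/// %hi = trunc i64 %hi.s to i32
+/// this builds one i32 PHI per extracted piece (offsets 0 and 32) and rewrites
+/// %lo and %hi to use them, so the illegal i64 value never needs to be
+/// materialized.
+///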
+Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
+ // PHIUsers - Keep track of all of the truncated values extracted from a set
+ // of PHIs, along with their offset. These are the things we want to rewrite.
+ SmallVector<PHIUsageRecord, 16> PHIUsers;
+
+ // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+ // nodes which are extracted from. PHIsToSlice is an ordered list of PHIs
+ // whose uses we need to check (to ensure they are all extracts), and
+ // PHIsInspected is a set we use to avoid revisiting PHIs.
+ SmallVector<PHINode*, 8> PHIsToSlice;
+ SmallPtrSet<PHINode*, 8> PHIsInspected;
+
+ PHIsToSlice.push_back(&FirstPhi);
+ PHIsInspected.insert(&FirstPhi);
+
+ for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
+ PHINode *PN = PHIsToSlice[PHIId];
+
+ // Scan the input list of the PHI. If any input is an invoke, and if the
+ // input is defined in the predecessor, then we won't be able to split the
+ // critical edge which is required to insert a truncate. Because of this, we
+ // have to bail out.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
+ if (!II) continue;
+ if (II->getParent() != PN->getIncomingBlock(i))
+ continue;
+
+ // If the incoming value is an invoke defined directly in the predecessor,
+ // then we have a critical edge where we need to put the truncate. Since we
+ // can't split the edge in instcombine, we have to bail out.
+ return nullptr;
+ }
+
+ for (User *U : PN->users()) {
+ Instruction *UserI = cast<Instruction>(U);
+
+ // If the user is a PHI, inspect its uses recursively.
+ if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
+ if (PHIsInspected.insert(UserPN).second)
+ PHIsToSlice.push_back(UserPN);
+ continue;
+ }
+
+ // Truncates are always ok.
+ if (isa<TruncInst>(UserI)) {
+ PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI));
+ continue;
+ }
+
+ // Otherwise it must be a lshr which can only be used by one trunc.
+ if (UserI->getOpcode() != Instruction::LShr ||
+ !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) ||
+ !isa<ConstantInt>(UserI->getOperand(1)))
+ return nullptr;
+
+ unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue();
+ PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back()));
+ }
+ }
+
+ // If we have no users, they must all be self-uses; just nuke the PHI.
+ if (PHIUsers.empty())
+ return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+
+ // If this phi node is transformable, create new PHIs for all the pieces
+ // extracted out of it. First, sort the users by their offset and size.
+ array_pod_sort(PHIUsers.begin(), PHIUsers.end());
+
+ DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
+ );
+
+ // PredValues - This is a temporary used when rewriting PHI nodes. It is
+ // hoisted out here to avoid construction/destruction thrashing.
+ DenseMap<BasicBlock*, Value*> PredValues;
+
+ // ExtractedVals - Each new PHI we introduce is saved here so we don't
+ // introduce redundant PHIs.
+ DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
+
+ for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
+ unsigned PHIId = PHIUsers[UserI].PHIId;
+ PHINode *PN = PHIsToSlice[PHIId];
+ unsigned Offset = PHIUsers[UserI].Shift;
+ Type *Ty = PHIUsers[UserI].Inst->getType();
+
+ PHINode *EltPHI;
+
+ // If we've already lowered a user like this, reuse the previously lowered
+ // value.
+ if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) {
+
+ // Otherwise, create the new PHI node for this user.
+ EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
+ PN->getName()+".off"+Twine(Offset), PN);
+ assert(EltPHI->getType() != PN->getType() &&
+ "Truncate didn't shrink phi?");
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ Value *&PredVal = PredValues[Pred];
+
+ // If we already have a value for this predecessor, reuse it.
+ if (PredVal) {
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ // Handle the PHI self-reuse case.
+ Value *InVal = PN->getIncomingValue(i);
+ if (InVal == PN) {
+ PredVal = EltPHI;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ if (PHINode *InPHI = dyn_cast<PHINode>(InVal)) {
+ // If the incoming value was a PHI, and if it was one of the PHIs we
+ // already rewrote, just use the lowered value.
+ if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+ PredVal = Res;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+ }
+
+ // Otherwise, do an extract in the predecessor.
+ Builder.SetInsertPoint(Pred->getTerminator());
+ Value *Res = InVal;
+ if (Offset)
+ Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
+ Offset), "extract");
+ Res = Builder.CreateTrunc(Res, Ty, "extract.t");
+ PredVal = Res;
+ EltPHI->addIncoming(Res, Pred);
+
+ // If the incoming value was a PHI, and if it was one of the PHIs we are
+ // rewriting, we will ultimately delete the code we inserted. This
+ // means we need to revisit that PHI to make sure we extract out the
+ // needed piece.
+ if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+ if (PHIsInspected.count(OldInVal)) {
+ unsigned RefPHIId =
+ find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
+ PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
+ cast<Instruction>(Res)));
+ ++UserE;
+ }
+ }
+ PredValues.clear();
+
+ DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
+ ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+ }
+
+ // Replace the use of this piece with the PHI node.
+ replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+ }
+
+ // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+ // with undefs.
+ Value *Undef = UndefValue::get(FirstPhi.getType());
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ replaceInstUsesWith(*PHIsToSlice[i], Undef);
+ return replaceInstUsesWith(FirstPhi, Undef);
+}
+
+// PHINode simplification
+//
+Instruction *InstCombiner::visitPHINode(PHINode &PN) {
+ if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
+ return replaceInstUsesWith(PN, V);
+
+ if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN))
+ return Result;
+
+ // If all PHI operands are the same operation, pull them through the PHI,
+ // reducing code size.
+ if (isa<Instruction>(PN.getIncomingValue(0)) &&
+ isa<Instruction>(PN.getIncomingValue(1)) &&
+ cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+ cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+ // FIXME: The hasOneUse check will fail for PHIs that use the value more
+ // than once (e.g. as the incoming value from several predecessors), even
+ // when the PHI itself is the only user.
+ PN.getIncomingValue(0)->hasOneUse())
+ if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
+ return Result;
+
+ // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
+ // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+ // PHI)... break the cycle.
+ if (PN.hasOneUse()) {
+ Instruction *PHIUser = cast<Instruction>(PN.user_back());
+ if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+ SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+ PotentiallyDeadPHIs.insert(&PN);
+ if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+
+ // If this phi has a single use, and if that use just computes a value for
+ // the next iteration of a loop, delete the phi. This occurs with unused
+ // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
+ // common case here is good because the only other things that catch this
+ // are induction variable analysis (sometimes) and ADCE, which is only run
+ // late.
+ if (PHIUser->hasOneUse() &&
+ (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+ PHIUser->user_back() == &PN) {
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+ // When a PHI is used only to be compared with zero, it is safe to replace
+ // an incoming value proved as known nonzero with any non-zero constant.
+ // For example, in the code below, the incoming value %v can be replaced
+ // with any non-zero constant based on the fact that the PHI is only used to
+ // be compared with zero and %v is a known non-zero value:
+ // %v = select %cond, 1, 2
+ // %p = phi [%v, BB] ...
+ // icmp eq, %p, 0
+ auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
+ // FIXME: To be simple, handle only integer type for now.
+ if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
+ match(CmpInst->getOperand(1), m_Zero())) {
+ ConstantInt *NonZeroConst = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
+ Value *VA = PN.getIncomingValue(i);
+ if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
+ if (!NonZeroConst)
+ NonZeroConst = GetAnyNonZeroConstInt(PN);
+ PN.setIncomingValue(i, NonZeroConst);
+ }
+ }
+ }
+ }
+
+ // We sometimes end up with phi cycles that non-obviously end up being the
+ // same value, for example:
+ // z = some value; x = phi (y, z); y = phi (x, z)
+ // where the phi nodes don't necessarily need to be in the same block. Do a
+ // quick check to see if the PHI node only contains a single non-phi value, if
+ // so, scan to see if the phi cycle is actually equal to that value.
+ {
+ unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
+ // Scan for the first non-phi operand.
+ while (InValNo != NumIncomingVals &&
+ isa<PHINode>(PN.getIncomingValue(InValNo)))
+ ++InValNo;
+
+ if (InValNo != NumIncomingVals) {
+ Value *NonPhiInVal = PN.getIncomingValue(InValNo);
+
+ // Scan the rest of the operands to see if there are any conflicts, if so
+ // there is no need to recursively scan other phis.
+ for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
+ Value *OpVal = PN.getIncomingValue(InValNo);
+ if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+ break;
+ }
+
+ // If we scanned over all operands, then we have one unique value plus
+ // phi values. Scan PHI nodes to see if they all merge in each other or
+ // the value.
+ if (InValNo == NumIncomingVals) {
+ SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+ if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+ return replaceInstUsesWith(PN, NonPhiInVal);
+ }
+ }
+ }
+
+ // If there are multiple PHIs, sort their operands so that they all list
+ // the blocks in the same order. This will help identical PHIs be eliminated
+ // by other passes. Other passes shouldn't depend on this for correctness
+ // however.
+ PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
+ if (&PN != FirstPN)
+ for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BBA = PN.getIncomingBlock(i);
+ BasicBlock *BBB = FirstPN->getIncomingBlock(i);
+ if (BBA != BBB) {
+ Value *VA = PN.getIncomingValue(i);
+ unsigned j = PN.getBasicBlockIndex(BBB);
+ Value *VB = PN.getIncomingValue(j);
+ PN.setIncomingBlock(i, BBB);
+ PN.setIncomingValue(i, VB);
+ PN.setIncomingBlock(j, BBA);
+ PN.setIncomingValue(j, VA);
+ // NOTE: Instcombine normally would want us to "return &PN" if we
+ // modified any of the operands of an instruction. However, since we
+ // aren't adding or removing uses (just rearranging them) we don't do
+ // this in this case.
+ }
+ }
+
+ // If this is an integer PHI and we know that it has an illegal type, see if
+ // it is only used by trunc or trunc(lshr) operations. If so, we split the
+ // PHI into the various pieces being extracted. This sort of thing is
+ // introduced when SROA promotes an aggregate to a single large integer type.
+ if (PN.getType()->isIntegerTy() &&
+ !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
+ return Res;
+
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
new file mode 100644
index 000000000000..4eebe8255998
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -0,0 +1,1546 @@
+//===- InstCombineSelect.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitSelect function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+static SelectPatternFlavor
+getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) {
+ switch (SPF) {
+ default:
+ llvm_unreachable("unhandled!");
+
+ case SPF_SMIN:
+ return SPF_SMAX;
+ case SPF_UMIN:
+ return SPF_UMAX;
+ case SPF_SMAX:
+ return SPF_SMIN;
+ case SPF_UMAX:
+ return SPF_UMIN;
+ }
+}
+
+static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF,
+ bool Ordered=false) {
+ switch (SPF) {
+ default:
+ llvm_unreachable("unhandled!");
+
+ case SPF_SMIN:
+ return ICmpInst::ICMP_SLT;
+ case SPF_UMIN:
+ return ICmpInst::ICMP_ULT;
+ case SPF_SMAX:
+ return ICmpInst::ICMP_SGT;
+ case SPF_UMAX:
+ return ICmpInst::ICMP_UGT;
+ case SPF_FMINNUM:
+ return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT;
+ case SPF_FMAXNUM:
+ return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT;
+ }
+}
+
+static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy &Builder,
+ SelectPatternFlavor SPF, Value *A,
+ Value *B) {
+ CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF);
+ assert(CmpInst::isIntPredicate(Pred));
+ return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
+}
+
+/// We want to turn code that looks like this:
+/// %C = or %A, %B
+/// %D = select %cond, %C, %A
+/// into:
+/// %C = select %cond, %B, 0
+/// %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+///
+static unsigned getSelectFoldableOperands(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return 3; // Can fold through either operand.
+ case Instruction::Sub: // Can only fold on the amount subtracted.
+ case Instruction::Shl: // Can only fold on the shift amount.
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return 1;
+ default:
+ return 0; // Cannot fold
+ }
+}
+
+/// For the same transformation as the previous function, return the identity
+/// constant that goes into the select.
+static Constant *getSelectFoldableConstant(Instruction *I) {
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("This cannot happen!");
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return Constant::getNullValue(I->getType());
+ case Instruction::And:
+ return Constant::getAllOnesValue(I->getType());
+ case Instruction::Mul:
+ return ConstantInt::get(I->getType(), 1);
+ }
+}
+
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
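+///
+/// Illustrative sketch of the cast case (made-up names):
+/// %t = trunc i64 %a to i32
+/// %f = trunc i64 %b to i32
+/// %r = select i1 %c, i32 %t, i32 %f
+/// becomes
+/// %r.v = select i1 %c, i64 %a, i64 %b
+/// %r = trunc i64 %r.v to i32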
+Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
+ Instruction *FI) {
+ // Don't break up min/max patterns. The hasOneUse checks below prevent that
+ // for most cases, but vector min/max with bitcasts can be transformed. If the
+ // one-use restrictions are eased for other patterns, we still don't want to
+ // obfuscate min/max.
+ if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+ match(&SI, m_SMax(m_Value(), m_Value())) ||
+ match(&SI, m_UMin(m_Value(), m_Value())) ||
+ match(&SI, m_UMax(m_Value(), m_Value()))))
+ return nullptr;
+
+ // If this is a cast from the same type, merge.
+ if (TI->getNumOperands() == 1 && TI->isCast()) {
+ Type *FIOpndTy = FI->getOperand(0)->getType();
+ if (TI->getOperand(0)->getType() != FIOpndTy)
+ return nullptr;
+
+ // The select condition may be a vector. We may only change the operand
+ // type if the vector width remains the same (and matches the condition).
+ Type *CondTy = SI.getCondition()->getType();
+ if (CondTy->isVectorTy()) {
+ if (!FIOpndTy->isVectorTy())
+ return nullptr;
+ if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
+ return nullptr;
+
+ // TODO: If the backend knew how to deal with casts better, we could
+ // remove this limitation. For now, there's too much potential to create
+ // worse codegen by promoting the select ahead of size-altering casts
+ // (PR28160).
+ //
+ // Note that ValueTracking's matchSelectPattern() looks through casts
+ // without checking 'hasOneUse' when it matches min/max patterns, so this
+ // transform may end up happening anyway.
+ if (TI->getOpcode() != Instruction::BitCast &&
+ (!TI->hasOneUse() || !FI->hasOneUse()))
+ return nullptr;
+
+ } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+ // TODO: The one-use restrictions for a scalar select could be eased if
+ // the fold of a select in visitLoadInst() was enhanced to match a pattern
+ // that includes a cast.
+ return nullptr;
+ }
+
+ // Fold this by inserting a select from the input values.
+ Value *NewSI =
+ Builder.CreateSelect(SI.getCondition(), TI->getOperand(0),
+ FI->getOperand(0), SI.getName() + ".v", &SI);
+ return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
+ TI->getType());
+ }
+
+ // Only handle binary operators with one-use here. As with the cast case
+ // above, it may be possible to relax the one-use constraint, but that needs
+ // be examined carefully since it may not reduce the total number of
+ // instructions.
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(TI);
+ if (!BO || !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ // Figure out if the operations have any operands in common.
+ Value *MatchOp, *OtherOpT, *OtherOpF;
+ bool MatchIsOpZero;
+ if (TI->getOperand(0) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = false;
+ } else if (!TI->isCommutative()) {
+ return nullptr;
+ } else if (TI->getOperand(0) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else {
+ return nullptr;
+ }
+
+ // If we reach here, they do have operations in common.
+ Value *NewSI = Builder.CreateSelect(SI.getCondition(), OtherOpT, OtherOpF,
+ SI.getName() + ".v", &SI);
+ Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
+ Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
+ return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+}
+
+static bool isSelect01(Constant *C1, Constant *C2) {
+ ConstantInt *C1I = dyn_cast<ConstantInt>(C1);
+ if (!C1I)
+ return false;
+ ConstantInt *C2I = dyn_cast<ConstantInt>(C2);
+ if (!C2I)
+ return false;
+ if (!C1I->isZero() && !C2I->isZero()) // One side must be zero.
+ return false;
+ return C1I->isOne() || C1I->isMinusOne() ||
+ C2I->isOne() || C2I->isMinusOne();
+}
+
+/// Try to fold the select into one of the operands to allow further
+/// optimization.
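+///
+/// Illustrative sketch (made-up names): with an 'add' in the true arm,
+/// %a = add i32 %x, %y
+/// %r = select i1 %c, i32 %a, i32 %x
+/// becomes
+/// %s = select i1 %c, i32 %y, i32 0 ; 0 is the identity constant for 'add'
+/// %r = add i32 %x, %s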
+Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
+ Value *FalseVal) {
+ // See the comment above getSelectFoldableOperands for a description of the
+ // transformation we are doing here.
+ if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
+ if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
+ !isa<Constant>(FalseVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(TVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
+ Constant *C = getSelectFoldableConstant(TVI);
+ Value *OOp = TVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
+ NewSel->takeName(TVI);
+ BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
+ BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
+ FalseVal, NewSel);
+ BO->copyIRFlags(TVI_BO);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
+ if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
+ !isa<Constant>(TrueVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(FVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
+ Constant *C = getSelectFoldableConstant(FVI);
+ Value *OOp = FVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
+ NewSel->takeName(FVI);
+ BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
+ BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
+ TrueVal, NewSel);
+ BO->copyIRFlags(FVI_BO);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// We want to turn:
+/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
+/// into:
+/// (or (shl (and X, C1), C3), Y)
+/// iff:
+/// C1 and C2 are both powers of 2
+/// where:
+/// C3 = Log(C2) - Log(C1)
+///
+/// This transform handles cases where:
+/// 1. The icmp predicate is inverted
+/// 2. The select operands are reversed
+/// 3. The magnitude of C2 and C1 are flipped
+static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
+ if (!IC || !SI.getType()->isIntegerTy())
+ return nullptr;
+
+ Value *CmpLHS = IC->getOperand(0);
+ Value *CmpRHS = IC->getOperand(1);
+
+ Value *V;
+ unsigned C1Log;
+ bool IsEqualZero;
+ bool NeedAnd = false;
+ if (IC->isEquality()) {
+ if (!match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ const APInt *C1;
+ if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
+ return nullptr;
+
+ V = CmpLHS;
+ C1Log = C1->logBase2();
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
+ } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
+ IC->getPredicate() == ICmpInst::ICMP_SGT) {
+ // We also need to recognize (icmp slt (trunc (X)), 0) and
+ // (icmp sgt (trunc (X)), -1).
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
+ if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
+ (!IsEqualZero && !match(CmpRHS, m_Zero())))
+ return nullptr;
+
+ if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
+ return nullptr;
+
+ C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
+ NeedAnd = true;
+ } else {
+ return nullptr;
+ }
+
+ const APInt *C2;
+ bool OrOnTrueVal = false;
+ bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
+ if (!OrOnFalseVal)
+ OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
+
+ if (!OrOnFalseVal && !OrOnTrueVal)
+ return nullptr;
+
+ Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
+
+ unsigned C2Log = C2->logBase2();
+
+ bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
+ bool NeedShift = C1Log != C2Log;
+ bool NeedZExtTrunc = Y->getType()->getIntegerBitWidth() !=
+ V->getType()->getIntegerBitWidth();
+
+ // Make sure we don't create more instructions than we save.
+ Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
+ if ((NeedShift + NeedXor + NeedZExtTrunc) >
+ (IC->hasOneUse() + Or->hasOneUse()))
+ return nullptr;
+
+ if (NeedAnd) {
+ // Insert the AND instruction on the input to the truncate.
+ APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
+ }
+
+ if (C2Log > C1Log) {
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateShl(V, C2Log - C1Log);
+ } else if (C1Log > C2Log) {
+ V = Builder.CreateLShr(V, C1Log - C2Log);
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ } else
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+
+ if (NeedXor)
+ V = Builder.CreateXor(V, *C2);
+
+ return Builder.CreateOr(V, Y);
+}
+
+/// Attempt to fold a cttz/ctlz followed by an icmp plus select into a single
+/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+///
+/// For example, we can fold the following code sequence:
+/// \code
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+/// %1 = icmp ne i32 %x, 0
+/// %2 = select i1 %1, i32 %0, i32 32
+/// \endcode
+///
+/// into:
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+
+ // Check if the condition value compares a value for equality against zero.
+ if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ Value *Count = FalseVal;
+ Value *ValueOnZero = TrueVal;
+ if (Pred == ICmpInst::ICMP_NE)
+ std::swap(Count, ValueOnZero);
+
+ // Skip zero extend/truncate.
+ Value *V = nullptr;
+ if (match(Count, m_ZExt(m_Value(V))) ||
+ match(Count, m_Trunc(m_Value(V))))
+ Count = V;
+
+ // Check if the value propagated on zero is a constant number equal to the
+ // sizeof in bits of 'Count'.
+ unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
+ if (!match(ValueOnZero, m_SpecificInt(SizeOfInBits)))
+ return nullptr;
+
+ // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
+ // input to the cttz/ctlz is used as LHS for the compare instruction.
+ if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) ||
+ match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) {
+ IntrinsicInst *II = cast<IntrinsicInst>(Count);
+ // Explicitly clear the 'is_zero_undef' flag.
+ IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone());
+ Type *Ty = NewI->getArgOperand(1)->getType();
+ NewI->setArgOperand(1, Constant::getNullValue(Ty));
+ Builder.Insert(NewI);
+ return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType());
+ }
+
+ return nullptr;
+}
+
+/// Return true if we find and adjust an icmp+select pattern where the compare
+/// is with a constant that can be incremented or decremented to match the
+/// minimum or maximum idiom.
+static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *CmpLHS = Cmp.getOperand(0);
+ Value *CmpRHS = Cmp.getOperand(1);
+ Value *TrueVal = Sel.getTrueValue();
+ Value *FalseVal = Sel.getFalseValue();
+
+ // We may move or edit the compare, so make sure the select is the only user.
+ const APInt *CmpC;
+ if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
+ return false;
+
+ // These transforms only work for selects of integers or vector selects of
+ // integer vectors.
+ Type *SelTy = Sel.getType();
+ auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
+ if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
+ return false;
+
+ Constant *AdjustedRHS;
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
+ else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
+ else
+ return false;
+
+ // X > C ? X : C+1 --> X < C+1 ? C+1 : X
+ // X < C ? X : C-1 --> X > C-1 ? C-1 : X
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ ; // Nothing to do here. Values match without any sign/zero extension.
+ }
+ // Types do not match. Instead of calculating this with mixed types, promote
+ // all to the larger type. This enables scalar evolution to analyze this
+ // expression.
+ else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
+ Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
+
+ // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
+ // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
+ // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
+ // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
+ if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = SextRHS;
+ } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
+ SextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = SextRHS;
+ } else if (Cmp.isUnsigned()) {
+ Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
+ // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
+ // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
+ // zext + signed compare cannot be changed:
+ // 0xff <s 0x00, but 0x00ff >s 0x0000
+ if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = ZextRHS;
+ } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
+ ZextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = ZextRHS;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ Cmp.setPredicate(Pred);
+ Cmp.setOperand(0, CmpLHS);
+ Cmp.setOperand(1, CmpRHS);
+ Sel.setOperand(1, TrueVal);
+ Sel.setOperand(2, FalseVal);
+ Sel.swapProfMetadata();
+
+ // Move the compare instruction right before the select instruction. Otherwise
+ // the sext/zext value may be defined after the compare instruction uses it.
+ Cmp.moveBefore(&Sel);
+
+ return true;
+}
+
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
+static Instruction *
+canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ // Canonicalize the compare predicate based on whether we have min or max.
+ Value *LHS, *RHS;
+ ICmpInst::Predicate NewPred;
+ SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
+ switch (SPR.Flavor) {
+ case SPF_SMIN: NewPred = ICmpInst::ICMP_SLT; break;
+ case SPF_UMIN: NewPred = ICmpInst::ICMP_ULT; break;
+ case SPF_SMAX: NewPred = ICmpInst::ICMP_SGT; break;
+ case SPF_UMAX: NewPred = ICmpInst::ICMP_UGT; break;
+ default: return nullptr;
+ }
+
+ // Is this already canonical?
+ if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+ Cmp.getPredicate() == NewPred)
+ return nullptr;
+
+ // Create the canonical compare and plug it into the select.
+ Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
+
+ // If the select operands did not change, we're done.
+ if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+ return &Sel;
+
+ // If we are swapping the select operands, swap the metadata too.
+ assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+ "Unexpected results from matchSelectPattern");
+ Sel.setTrueValue(LHS);
+ Sel.setFalseValue(RHS);
+ Sel.swapProfMetadata();
+ return &Sel;
+}
+
+/// Visit a SelectInst that has an ICmpInst as its first operand.
+Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
+ ICmpInst *ICI) {
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder))
+ return NewSel;
+
+ bool Changed = adjustMinMax(SI, *ICI);
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
+ // and (X <s 0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
+ // FIXME: Type and constness constraints could be lifted, but we have to
+ // watch code size carefully. We should consider xor instead of
+ // sub/add when we decide to do that.
+ if (IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) {
+ if (TrueVal->getType() == Ty) {
+ if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) {
+ ConstantInt *C1 = nullptr, *C2 = nullptr;
+ if (Pred == ICmpInst::ICMP_SGT && Cmp->isMinusOne()) {
+ C1 = dyn_cast<ConstantInt>(TrueVal);
+ C2 = dyn_cast<ConstantInt>(FalseVal);
+ } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isZero()) {
+ C1 = dyn_cast<ConstantInt>(FalseVal);
+ C2 = dyn_cast<ConstantInt>(TrueVal);
+ }
+ if (C1 && C2) {
+ // This shift results in either -1 or 0.
+ Value *AShr = Builder.CreateAShr(CmpLHS, Ty->getBitWidth() - 1);
+
+ // Check if we can express the operation with a single or.
+ if (C2->isMinusOne())
+ return replaceInstUsesWith(SI, Builder.CreateOr(AShr, C1));
+
+ Value *And = Builder.CreateAnd(AShr, C2->getValue() - C1->getValue());
+ return replaceInstUsesWith(SI, Builder.CreateAdd(And, C1));
+ }
+ }
+ }
+ }
+
+ // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
+ if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
+ if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
+ // Transform (X == C) ? X : Y -> (X == C) ? C : Y
+ SI.setOperand(1, CmpRHS);
+ Changed = true;
+ } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
+ // Transform (X != C) ? Y : X -> (X != C) ? Y : C
+ SI.setOperand(2, CmpRHS);
+ Changed = true;
+ }
+ }
+
+ // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
+ // decomposeBitTestICmp() might help.
+ {
+ unsigned BitWidth =
+ DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
+ APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
+ Value *X;
+ const APInt *Y, *C;
+ bool TrueWhenUnset;
+ bool IsBitTest = false;
+ if (ICmpInst::isEquality(Pred) &&
+ match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
+ match(CmpRHS, m_Zero())) {
+ IsBitTest = true;
+ TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
+ } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = false;
+ } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = true;
+ }
+ if (IsBitTest) {
+ Value *V = nullptr;
+ // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
+ if (TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
+ else if (!TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) == 0 ? X ^ Y : X --> X | Y
+ else if (TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+ // (X & Y) != 0 ? X : X ^ Y --> X | Y
+ else if (!TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+
+ if (V)
+ return replaceInstUsesWith(SI, V);
+ }
+ }
+
+ if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ return Changed ? &SI : nullptr;
+}
+
+
+/// SI is a select whose condition is a PHI node (but the two may be in
+/// different blocks). See if the true/false values (V) are live in all of the
+/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
+///
+/// X = phi [ C1, BB1], [C2, BB2]
+/// Y = add
+/// Z = select X, Y, 0
+///
+/// because Y is not live in BB1/BB2.
+///
+static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
+ const SelectInst &SI) {
+ // If the value is a non-instruction value like a constant or argument, it
+ // can always be mapped.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return true;
+
+ // If V is a PHI node defined in the same block as the condition PHI, we can
+ // map the arguments.
+ const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
+
+ if (const PHINode *VP = dyn_cast<PHINode>(I))
+ if (VP->getParent() == CondPHI->getParent())
+ return true;
+
+ // Otherwise, if the PHI and select are defined in the same block and if V is
+ // defined in a different block, then we can transform it.
+ if (SI.getParent() == CondPHI->getParent() &&
+ I->getParent() != CondPHI->getParent())
+ return true;
+
+ // Otherwise we have a 'hard' case and we can't tell without doing more
+ // detailed dominator-based analysis; punt.
+ return false;
+}
+
+/// We have an SPF (e.g. a min or max) of an SPF of the form:
+/// SPF2(SPF1(A, B), C)
+Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
+ SelectPatternFlavor SPF1,
+ Value *A, Value *B,
+ Instruction &Outer,
+ SelectPatternFlavor SPF2, Value *C) {
+ if (Outer.getType() != Inner->getType())
+ return nullptr;
+
+ if (C == A || C == B) {
+ // MAX(MAX(A, B), B) -> MAX(A, B)
+ // MIN(MIN(a, b), a) -> MIN(a, b)
+ if (SPF1 == SPF2)
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MAX(MIN(a, b), a) -> a
+ // MIN(MAX(a, b), a) -> a
+ if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
+ (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
+ (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
+ (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
+ return replaceInstUsesWith(Outer, C);
+ }
+
+ if (SPF1 == SPF2) {
+ const APInt *CB, *CC;
+ if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
+ // MIN(MIN(A, 23), 97) -> MIN(A, 23)
+ // MAX(MAX(A, 97), 23) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->sge(*CC)))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MIN(MIN(A, 97), 23) -> MIN(A, 23)
+ // MAX(MAX(A, 23), 97) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->slt(*CC))) {
+ Outer.replaceUsesOfWith(Inner, A);
+ return &Outer;
+ }
+ }
+ }
+
+ // ABS(ABS(X)) -> ABS(X)
+ // NABS(NABS(X)) -> NABS(X)
+ if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
+ return replaceInstUsesWith(Outer, Inner);
+ }
+
+ // ABS(NABS(X)) -> ABS(X)
+ // NABS(ABS(X)) -> NABS(X)
+ if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
+ (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
+ SelectInst *SI = cast<SelectInst>(Inner);
+ Value *NewSI =
+ Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
+ SI->getTrueValue(), SI->getName(), SI);
+ return replaceInstUsesWith(Outer, NewSI);
+ }
+
+ auto IsFreeOrProfitableToInvert =
+ [&](Value *V, Value *&NotV, bool &ElidesXor) {
+ if (match(V, m_Not(m_Value(NotV)))) {
+ // If V has at most 2 uses then we can get rid of the xor operation
+ // entirely.
+ ElidesXor |= !V->hasNUsesOrMore(3);
+ return true;
+ }
+
+ if (IsFreeToInvert(V, !V->hasNUsesOrMore(3))) {
+ NotV = nullptr;
+ return true;
+ }
+
+ return false;
+ };
+
+ Value *NotA, *NotB, *NotC;
+ bool ElidesXor = false;
+
+ // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
+ // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C)
+ // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C)
+ // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C)
+ //
+ // This transform is performance neutral if we can elide at least one xor from
+ // the set of three operands, since we'll be tacking on an xor at the very
+ // end.
+ if (SelectPatternResult::isMinOrMax(SPF1) &&
+ SelectPatternResult::isMinOrMax(SPF2) &&
+ IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+ IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
+ IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
+ if (!NotA)
+ NotA = Builder.CreateNot(A);
+ if (!NotB)
+ NotB = Builder.CreateNot(B);
+ if (!NotC)
+ NotC = Builder.CreateNot(C);
+
+ Value *NewInner = generateMinMaxSelectPattern(
+ Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB);
+ Value *NewOuter = Builder.CreateNot(generateMinMaxSelectPattern(
+ Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC));
+ return replaceInstUsesWith(Outer, NewOuter);
+ }
+
+ return nullptr;
+}
+
+/// If one of the constants is zero (we know they can't both be) and we have an
+/// icmp instruction with zero, and we have an 'and' with the non-constant value
+/// and a power of two we can turn the select into a shift on the result of the
+/// 'and'.
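+///
+/// Illustrative sketch (made-up constants, not from the original comments):
+/// %a = and i32 %x, 4
+/// %c = icmp eq i32 %a, 0
+/// %r = select i1 %c, i32 0, i32 8
+/// becomes a single shift of the masked bit from position 2 to position 3:
+/// %r = shl i32 %a, 1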
+static Value *foldSelectICmpAnd(const SelectInst &SI, APInt TrueVal,
+ APInt FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
+ if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
+ return nullptr;
+
+ if (!match(IC->getOperand(1), m_Zero()))
+ return nullptr;
+
+ ConstantInt *AndRHS;
+ Value *LHS = IC->getOperand(0);
+ if (!match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS))))
+ return nullptr;
+
+ // If both select arms are non-zero see if we have a select of the form
+ // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
+ // for 'x ? 2^n : 0' and fix the thing up at the end.
+ APInt Offset(TrueVal.getBitWidth(), 0);
+ if (!TrueVal.isNullValue() && !FalseVal.isNullValue()) {
+ if ((TrueVal - FalseVal).isPowerOf2())
+ Offset = FalseVal;
+ else if ((FalseVal - TrueVal).isPowerOf2())
+ Offset = TrueVal;
+ else
+ return nullptr;
+
+ // Adjust TrueVal and FalseVal to the offset.
+ TrueVal -= Offset;
+ FalseVal -= Offset;
+ }
+
+ // Make sure the mask in the 'and' and one of the select arms is a power of 2.
+ if (!AndRHS->getValue().isPowerOf2() ||
+ (!TrueVal.isPowerOf2() && !FalseVal.isPowerOf2()))
+ return nullptr;
+
+ // Determine which shift is needed to transform result of the 'and' into the
+ // desired result.
+ const APInt &ValC = !TrueVal.isNullValue() ? TrueVal : FalseVal;
+ unsigned ValZeros = ValC.logBase2();
+ unsigned AndZeros = AndRHS->getValue().logBase2();
+
+ // If the types don't match, we can still convert the select by introducing a
+ // zext or a trunc of the 'and'. The trunc case requires that all of the
+ // truncated bits are zero; we can figure that out by looking at the 'and' mask.
+ if (AndZeros >= ValC.getBitWidth())
+ return nullptr;
+
+ Value *V = Builder.CreateZExtOrTrunc(LHS, SI.getType());
+ if (ValZeros > AndZeros)
+ V = Builder.CreateShl(V, ValZeros - AndZeros);
+ else if (ValZeros < AndZeros)
+ V = Builder.CreateLShr(V, AndZeros - ValZeros);
+
+ // Okay, now we know that everything is set up; we just don't know whether we
+ // have an icmp_ne or icmp_eq and whether the true or the false value is the zero.
+ bool ShouldNotVal = !TrueVal.isNullValue();
+ ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+ if (ShouldNotVal)
+ V = Builder.CreateXor(V, ValC);
+
+ // Apply an offset if needed.
+ if (!Offset.isNullValue())
+ V = Builder.CreateAdd(V, ConstantInt::get(V->getType(), Offset));
+ return V;
+}
+
+/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
+/// This is even legal for FP.
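+/// For example (illustrative IR; the add/sub are assumed to have one use each):
+///   %a = add i32 %x, %y
+///   %s = sub i32 %x, %y
+///   %r = select i1 %c, i32 %a, i32 %s
+/// becomes
+///   %n = sub i32 0, %y
+///   %p = select i1 %c, i32 %y, i32 %n
+///   %r = add i32 %x, %p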
+static Instruction *foldAddSubSelect(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ Instruction *AddOp = nullptr, *SubOp = nullptr;
+ if ((TI->getOpcode() == Instruction::Sub &&
+ FI->getOpcode() == Instruction::Add) ||
+ (TI->getOpcode() == Instruction::FSub &&
+ FI->getOpcode() == Instruction::FAdd)) {
+ AddOp = FI;
+ SubOp = TI;
+ } else if ((FI->getOpcode() == Instruction::Sub &&
+ TI->getOpcode() == Instruction::Add) ||
+ (FI->getOpcode() == Instruction::FSub &&
+ TI->getOpcode() == Instruction::FAdd)) {
+ AddOp = TI;
+ SubOp = FI;
+ }
+
+ if (AddOp) {
+ Value *OtherAddOp = nullptr;
+ if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+ OtherAddOp = AddOp->getOperand(1);
+ } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+ OtherAddOp = AddOp->getOperand(0);
+ }
+
+ if (OtherAddOp) {
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
+ if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ NegInst->setFastMathFlags(Flags);
+ }
+ } else {
+ NegVal = Builder.CreateNeg(SubOp->getOperand(1));
+ }
+
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
+ SI.getName() + ".p", &SI);
+
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ Instruction *RI =
+ BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ return RI;
+ } else
+ return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+ }
+ }
+ return nullptr;
+}
+
+Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
+ Instruction *ExtInst;
+ if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
+ !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
+ return nullptr;
+
+ auto ExtOpcode = ExtInst->getOpcode();
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ return nullptr;
+
+ // TODO: Handle larger types? That requires adjusting FoldOpIntoSelect too.
+ Value *X = ExtInst->getOperand(0);
+ Type *SmallType = X->getType();
+ if (!SmallType->isIntOrIntVectorTy(1))
+ return nullptr;
+
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
+ // If the constant is the same after truncation to the smaller type and
+ // extension to the original type, we can narrow the select.
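+ // E.g. select i1 %c, (zext i1 %x to i32), i32 1
+ //        --> zext (select i1 %c, i1 %x, i1 true) to i32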
+ Value *Cond = Sel.getCondition();
+ Type *SelType = Sel.getType();
+ Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
+ Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
+ if (ExtC == C) {
+ Value *TruncCVal = cast<Value>(TruncC);
+ if (ExtInst == Sel.getFalseValue())
+ std::swap(X, TruncCVal);
+
+ // select Cond, (ext X), C --> ext(select Cond, X, C')
+ // select Cond, C, (ext X) --> ext(select Cond, C', X)
+ Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
+ return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
+ }
+
+ // If one arm of the select is the extend of the condition, replace that arm
+ // with the extension of the appropriate known bool value.
+ if (Cond == X) {
+ if (ExtInst == Sel.getTrueValue()) {
+ // select X, (sext X), C --> select X, -1, C
+ // select X, (zext X), C --> select X, 1, C
+ Constant *One = ConstantInt::getTrue(SmallType);
+ Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
+ return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
+ } else {
+ // select X, C, (sext X) --> select X, C, 0
+ // select X, C, (zext X) --> select X, C, 0
+ Constant *Zero = ConstantInt::getNullValue(SelType);
+ return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Try to transform a vector select with a constant condition vector into a
+/// shuffle for easier combining with other shuffles and insert/extract.
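+/// For example:
+///   select <2 x i1> <i1 true, i1 false>, <2 x i8> %a, <2 x i8> %b
+///     --> shufflevector <2 x i8> %a, <2 x i8> %b, <2 x i32> <i32 0, i32 3>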
+static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Constant *CondC;
+ if (!CondVal->getType()->isVectorTy() || !match(CondVal, m_Constant(CondC)))
+ return nullptr;
+
+ unsigned NumElts = CondVal->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> Mask;
+ Mask.reserve(NumElts);
+ Type *Int32Ty = Type::getInt32Ty(CondVal->getContext());
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = CondC->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (Elt->isOneValue()) {
+ // If the select condition element is true, choose from the 1st vector.
+ Mask.push_back(ConstantInt::get(Int32Ty, i));
+ } else if (Elt->isNullValue()) {
+ // If the select condition element is false, choose from the 2nd vector.
+ Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts));
+ } else if (isa<UndefValue>(Elt)) {
+ // Undef in a select condition (choose one of the operands) does not mean
+ // the same thing as undef in a shuffle mask (any value is acceptable), so
+ // give up.
+ return nullptr;
+ } else {
+ // Bail out on a constant expression.
+ return nullptr;
+ }
+ }
+
+ return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(),
+ ConstantVector::get(Mask));
+}
+
+/// Reuse bitcasted operands between a compare and select:
+/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
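+/// One possible instance: C and D are i32 values compared as float and
+/// selected as <2 x i16>; the fold selects between the float operands of the
+/// compare and bitcasts the chosen value to <2 x i16>.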
+static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
+ return nullptr;
+
+ // The select condition is a compare instruction. If the select's true/false
+ // values are already the same as the compare operands, there's nothing to do.
+ if (TVal == A || TVal == B || FVal == A || FVal == B)
+ return nullptr;
+
+ Value *C, *D;
+ if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
+ return nullptr;
+
+ // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
+ Value *TSrc, *FSrc;
+ if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
+ !match(FVal, m_BitCast(m_Value(FSrc))))
+ return nullptr;
+
+ // If the select true/false values are *different bitcasts* of the same source
+ // operands, make the select operands the same as the compare operands and
+ // cast the result. This is the canonical select form for min/max.
+ Value *NewSel;
+ if (TSrc == C && FSrc == D) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+ // bitcast (select (cmp A, B), A, B)
+ NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
+ } else if (TSrc == D && FSrc == C) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
+ // bitcast (select (cmp A, B), B, A)
+ NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
+ } else {
+ return nullptr;
+ }
+ return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ Type *SelType = SI.getType();
+
+ if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+ SQ.getWithInstruction(&SI)))
+ return replaceInstUsesWith(SI, V);
+
+ if (Instruction *I = canonicalizeSelectToShuffle(SI))
+ return I;
+
+ // Canonicalize a one-use integer compare with a non-canonical predicate by
+ // inverting the predicate and swapping the select operands. This matches a
+ // compare canonicalization for conditional branches.
+ // TODO: Should we do the same for FP compares?
+ CmpInst::Predicate Pred;
+ if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap true/false values and condition.
+ CmpInst *Cond = cast<CmpInst>(CondVal);
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ SI.setOperand(1, FalseVal);
+ SI.setOperand(2, TrueVal);
+ SI.swapProfMetadata();
+ Worklist.Add(Cond);
+ return &SI;
+ }
+
+ if (SelType->isIntOrIntVectorTy(1) &&
+ TrueVal->getType() == CondVal->getType()) {
+ if (match(TrueVal, m_One())) {
+ // Change: A = select B, true, C --> A = or B, C
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ }
+ if (match(TrueVal, m_Zero())) {
+ // Change: A = select B, false, C --> A = and !B, C
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return BinaryOperator::CreateAnd(NotCond, FalseVal);
+ }
+ if (match(FalseVal, m_Zero())) {
+ // Change: A = select B, C, false --> A = and B, C
+ return BinaryOperator::CreateAnd(CondVal, TrueVal);
+ }
+ if (match(FalseVal, m_One())) {
+ // Change: A = select B, C, true --> A = or !B, C
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return BinaryOperator::CreateOr(NotCond, TrueVal);
+ }
+
+ // select a, a, b -> a | b
+ // select a, b, a -> a & b
+ if (CondVal == TrueVal)
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ if (CondVal == FalseVal)
+ return BinaryOperator::CreateAnd(CondVal, TrueVal);
+
+ // select a, ~a, b -> (~a) & b
+ // select a, b, ~a -> (~a) | b
+ if (match(TrueVal, m_Not(m_Specific(CondVal))))
+ return BinaryOperator::CreateAnd(TrueVal, FalseVal);
+ if (match(FalseVal, m_Not(m_Specific(CondVal))))
+ return BinaryOperator::CreateOr(TrueVal, FalseVal);
+ }
+
+ // Selecting between two integer or vector splat integer constants?
+ //
+ // Note that we don't handle a scalar select of vectors:
+ // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
+ // because that may need 3 instructions to splat the condition value:
+ // extend, insertelement, shufflevector.
+ if (SelType->isIntOrIntVectorTy() &&
+ CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
+ // select C, 1, 0 -> zext C to int
+ if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
+ return new ZExtInst(CondVal, SelType);
+
+ // select C, -1, 0 -> sext C to int
+ if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
+ return new SExtInst(CondVal, SelType);
+
+ // select C, 0, 1 -> zext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new ZExtInst(NotCond, SelType);
+ }
+
+ // select C, 0, -1 -> sext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new SExtInst(NotCond, SelType);
+ }
+ }
+
+ if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+ if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal))
+ if (Value *V = foldSelectICmpAnd(SI, TrueValC->getValue(),
+ FalseValC->getValue(), Builder))
+ return replaceInstUsesWith(SI, V);
+
+ // See if we are selecting two values based on a comparison of the two values.
+ if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+ if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+ // Transform (X == Y) ? X : Y -> Y
+ if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+ // Transform (X une Y) ? X : Y -> X
+ if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return replaceInstUsesWith(SI, TrueVal);
+ }
+
+ // Canonicalize to use ordered comparisons by swapping the select
+ // operands.
+ //
+ // e.g.
+ // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
+ if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+ FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal,
+ FCI->getName() + ".inv");
+
+ return SelectInst::Create(NewCond, FalseVal, TrueVal,
+ SI.getName() + ".p");
+ }
+
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+ } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+ // Transform (X == Y) ? Y : X -> X
+ if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+ // Transform (X une Y) ? Y : X -> Y
+ if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+ // This is not safe in general for floating point:
+ // consider X== -0, Y== +0.
+ // It becomes safe if either operand is a nonzero constant.
+ ConstantFP *CFPt, *CFPf;
+ if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+ !CFPt->getValueAPF().isZero()) ||
+ ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+ !CFPf->getValueAPF().isZero()))
+ return replaceInstUsesWith(SI, TrueVal);
+ }
+
+ // Canonicalize to use ordered comparisons by swapping the select
+ // operands.
+ //
+ // e.g.
+ // (X ugt Y) ? Y : X -> (X ole Y) ? X : Y
+ if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+ FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal,
+ FCI->getName() + ".inv");
+
+ return SelectInst::Create(NewCond, FalseVal, TrueVal,
+ SI.getName() + ".p");
+ }
+
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+ }
+ // NOTE: if we wanted to, this is where to detect ABS
+ }
+
+ // See if we are selecting two values based on a comparison of the two values.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+ if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
+ return Result;
+
+ if (Instruction *Add = foldAddSubSelect(SI, Builder))
+ return Add;
+
+ // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (TI && FI && TI->getOpcode() == FI->getOpcode())
+ if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
+ return IV;
+
+ if (Instruction *I = foldSelectExtConst(SI))
+ return I;
+
+ // See if we can fold the select into one of our operands.
+ if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
+ if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
+ return FoldI;
+
+ Value *LHS, *RHS, *LHS2, *RHS2;
+ Instruction::CastOps CastOp;
+ SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
+ auto SPF = SPR.Flavor;
+
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // Canonicalize so that type casts are outside select patterns.
+ if (LHS->getType()->getPrimitiveSizeInBits() !=
+ SelType->getPrimitiveSizeInBits()) {
+ CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered);
+
+ Value *Cmp;
+ if (CmpInst::isIntPredicate(Pred)) {
+ Cmp = Builder.CreateICmp(Pred, LHS, RHS);
+ } else {
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+ Cmp = Builder.CreateFCmp(Pred, LHS, RHS);
+ }
+
+ Value *NewSI = Builder.CreateCast(
+ CastOp, Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI),
+ SelType);
+ return replaceInstUsesWith(SI, NewSI);
+ }
+ }
+
+ if (SPF) {
+ // MAX(MAX(a, b), a) -> MAX(a, b)
+ // MIN(MIN(a, b), a) -> MIN(a, b)
+ // MAX(MIN(a, b), a) -> a
+ // MIN(MAX(a, b), a) -> a
+ // ABS(ABS(a)) -> ABS(a)
+ // NABS(NABS(a)) -> NABS(a)
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
+ SI, SPF, RHS))
+ return R;
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,
+ SI, SPF, LHS))
+ return R;
+ }
+
+ // MAX(~a, ~b) -> ~MIN(a, b)
+ if ((SPF == SPF_SMAX || SPF == SPF_UMAX) &&
+ IsFreeToInvert(LHS, LHS->hasNUses(2)) &&
+ IsFreeToInvert(RHS, RHS->hasNUses(2))) {
+ // For this transform to be profitable, we need to eliminate at least two
+ // 'not' instructions if we're going to add one 'not' instruction.
+ int NumberOfNots =
+ (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) +
+ (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) +
+ (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
+
+ if (NumberOfNots >= 2) {
+ Value *NewLHS = Builder.CreateNot(LHS);
+ Value *NewRHS = Builder.CreateNot(RHS);
+ Value *NewCmp = SPF == SPF_SMAX ? Builder.CreateICmpSLT(NewLHS, NewRHS)
+ : Builder.CreateICmpULT(NewLHS, NewRHS);
+ Value *NewSI =
+ Builder.CreateNot(Builder.CreateSelect(NewCmp, NewLHS, NewRHS));
+ return replaceInstUsesWith(SI, NewSI);
+ }
+ }
+
+ // TODO.
+ // ABS(-X) -> ABS(X)
+ }
+
+ // See if we can fold the select into a phi node if the condition is a select.
+ if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
+ // The true/false values have to be live in the PHI predecessor's blocks.
+ if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+ canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+ if (Instruction *NV = foldOpIntoPhi(SI, PN))
+ return NV;
+
+ if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+ if (TrueSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, select(C, a, b), c) -> select(C, a, c)
+ if (TrueSI->getCondition() == CondVal) {
+ if (SI.getTrueValue() == TrueSI->getTrueValue())
+ return nullptr;
+ SI.setOperand(1, TrueSI->getTrueValue());
+ return &SI;
+ }
+ // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
+ // We choose this as the normal form to enable folding on the And and to
+ // shorten the paths to the values (this helps GetUnderlyingObjects(), for example).
+ if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
+ Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
+ SI.setOperand(0, And);
+ SI.setOperand(1, TrueSI->getTrueValue());
+ return &SI;
+ }
+ }
+ }
+ if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+ if (FalseSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, a, select(C, b, c)) -> select(C, a, c)
+ if (FalseSI->getCondition() == CondVal) {
+ if (SI.getFalseValue() == FalseSI->getFalseValue())
+ return nullptr;
+ SI.setOperand(2, FalseSI->getFalseValue());
+ return &SI;
+ }
+ // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
+ if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
+ Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
+ SI.setOperand(0, Or);
+ SI.setOperand(2, FalseSI->getFalseValue());
+ return &SI;
+ }
+ }
+ }
+
+ if (BinaryOperator::isNot(CondVal)) {
+ SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+ SI.setOperand(1, FalseVal);
+ SI.setOperand(2, TrueVal);
+ return &SI;
+ }
+
+ if (VectorType *VecTy = dyn_cast<VectorType>(SelType)) {
+ unsigned VWidth = VecTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) {
+ if (V != &SI)
+ return replaceInstUsesWith(SI, V);
+ return &SI;
+ }
+
+ if (isa<ConstantAggregateZero>(CondVal)) {
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+ }
+
+ // See if we can determine the result of this select based on a dominating
+ // condition.
+ BasicBlock *Parent = SI.getParent();
+ if (BasicBlock *Dom = Parent->getSinglePredecessor()) {
+ auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
+ if (PBI && PBI->isConditional() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
+ (PBI->getSuccessor(0) == Parent || PBI->getSuccessor(1) == Parent)) {
+ bool CondIsFalse = PBI->getSuccessor(1) == Parent;
+ Optional<bool> Implication = isImpliedCondition(
+ PBI->getCondition(), SI.getCondition(), DL, CondIsFalse);
+ if (Implication) {
+ Value *V = *Implication ? TrueVal : FalseVal;
+ return replaceInstUsesWith(SI, V);
+ }
+ }
+ }
+
+ // If we can compute the condition, there's no need for a select.
+ // Like the above fold, we are attempting to reduce compile-time cost by
+ // putting this fold here with limitations rather than in InstSimplify.
+ // The motivation for this call into value tracking is to take advantage of
+ // the assumption cache, so make sure that is populated.
+ if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
+ KnownBits Known(1);
+ computeKnownBits(CondVal, Known, 0, &SI);
+ if (Known.One.isOneValue())
+ return replaceInstUsesWith(SI, TrueVal);
+ if (Known.Zero.isOneValue())
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+
+ if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
+ return BitCastSel;
+
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
new file mode 100644
index 000000000000..7ed141c7fd79
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -0,0 +1,794 @@
+//===- InstCombineShifts.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitShl, visitLShr, and visitAShr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ assert(Op0->getType() == Op1->getType());
+
+ // See if we can fold away this shift.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Try to fold constant and into select arguments.
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (Constant *CUI = dyn_cast<Constant>(Op1))
+ if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+ return Res;
+
+ // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
+ // iff A and C2 are both positive.
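+ // E.g. for shl: 16 << (A + 2) --> (16 << 2) << A == 64 << A.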
+ Value *A;
+ Constant *C;
+ if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
+ if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
+ isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
+ return BinaryOperator::Create(
+ I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
+
+ // X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2,
+ // because shifts by negative values (which could occur if A were negative)
+ // are undefined.
+ const APInt *B;
+ if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) {
+ // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
+ // demand the sign bit (and many others) here??
+ Value *Rem = Builder.CreateAnd(A, ConstantInt::get(I.getType(), *B - 1),
+ Op1->getName());
+ I.setOperand(1, Rem);
+ return &I;
+ }
+
+ return nullptr;
+}
+
+/// Return true if we can simplify two logical (either left or right) shifts
+/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
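+/// For example, with i8 operands: lshr (shl X, 3), 3 can be evaluated as
+/// X & 0x1f, and shl (shl X, 2), 3 can be evaluated as shl X, 5.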
+static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
+ Instruction *InnerShift, InstCombiner &IC,
+ Instruction *CxtI) {
+ assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
+
+ // We need constant scalar or constant splat shifts.
+ const APInt *InnerShiftConst;
+ if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
+ return false;
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ if (IsInnerShl == IsOuterShl)
+ return true;
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+ if (InnerShAmt == OuterShAmt)
+ return true;
+
+ // If the 2nd shift is bigger than the 1st, we can fold:
+ // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
+ // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
+ // but it isn't profitable unless we know the and'd out bits are already zero.
+ // Also, check that the inner shift is valid (less than the type width) or
+ // we'll crash trying to produce the bit mask for the 'and'.
+ unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+ if (InnerShAmt > OuterShAmt && InnerShAmt < TypeWidth) {
+ unsigned MaskShift =
+ IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
+/// %C = shl i128 %A, 64
+/// %D = shl i128 %B, 96
+/// %E = or i128 %C, %D
+/// %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
+ InstCombiner &IC, Instruction *CxtI) {
+ // We can always evaluate constants shifted.
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // If this is the opposite shift, we can directly reuse the input of the shift
+ // if the needed bits are already zero in the input. This allows us to reuse
+ // the value, which means that we don't care if the shift has multiple uses.
+ // TODO: Handle opposite shift by exact value.
+ ConstantInt *CI = nullptr;
+ if ((IsLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
+ (!IsLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
+ if (CI->getZExtValue() == NumBits) {
+ // TODO: Check that the input bits are already zero with MaskedValueIsZero
+#if 0
+ // If this is a truncate of a logical shr, we can truncate it to a smaller
+ // lshr iff we know that the bits we would otherwise be shifting in are
+ // already zeros.
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ if (MaskedValueIsZero(I->getOperand(0),
+ APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+ CI->getLimitedValue(BitWidth) < BitWidth) {
+ return CanEvaluateTruncated(I->getOperand(0), Ty);
+ }
+#endif
+
+ }
+ }
+
+ // We can't mutate something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can always be evaluated in shifted form.
+ return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
+
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+ return false;
+ return true;
+ }
+ }
+}
+
+/// Fold OuterShift (InnerShift X, C1), C2.
+/// See canEvaluateShiftedShift() for the constraints on these instructions.
+static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
+ bool IsOuterShl,
+ InstCombiner::BuilderTy &Builder) {
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ Type *ShType = InnerShift->getType();
+ unsigned TypeWidth = ShType->getScalarSizeInBits();
+
+ // We only accept shifts-by-a-constant in canEvaluateShifted().
+ const APInt *C1;
+ match(InnerShift->getOperand(1), m_APInt(C1));
+ unsigned InnerShAmt = C1->getZExtValue();
+
+ // Change the shift amount and clear the appropriate IR flags.
+ auto NewInnerShift = [&](unsigned ShAmt) {
+ InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
+ if (IsInnerShl) {
+ InnerShift->setHasNoUnsignedWrap(false);
+ InnerShift->setHasNoSignedWrap(false);
+ } else {
+ InnerShift->setIsExact(false);
+ }
+ return InnerShift;
+ };
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ if (IsInnerShl == IsOuterShl) {
+ // If this is an oversized composite shift, then unsigned shifts get 0.
+ if (InnerShAmt + OuterShAmt >= TypeWidth)
+ return Constant::getNullValue(ShType);
+
+ return NewInnerShift(InnerShAmt + OuterShAmt);
+ }
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (InnerShAmt == OuterShAmt) {
+ APInt Mask = IsInnerShl
+ ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
+ : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
+ Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
+ ConstantInt::get(ShType, Mask));
+ if (auto *AndI = dyn_cast<Instruction>(And)) {
+ AndI->moveBefore(InnerShift);
+ AndI->takeName(InnerShift);
+ }
+ return And;
+ }
+
+ assert(InnerShAmt > OuterShAmt &&
+ "Unexpected opposite direction logical shift pair");
+
+ // In general, we would need an 'and' for this transform, but
+ // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
+ // lshr (shl X, C1), C2 --> shl X, C1 - C2
+ // shl (lshr X, C1), C2 --> lshr X, C1 - C2
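+ // E.g. for i8: lshr (shl X, 5), 2 --> shl X, 3, provided bits 3 and 4 of X
+ // are known zero (otherwise an 'and' with 0x38 would be needed).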
+ return NewInnerShift(InnerShAmt - OuterShAmt);
+}
+
+/// When canEvaluateShifted() returns true for an expression, this function
+/// inserts the new computation that produces the shifted value.
+static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+ InstCombiner &IC, const DataLayout &DL) {
+ // We can always evaluate constants shifted.
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isLeftShift)
+ V = IC.Builder.CreateShl(C, NumBits);
+ else
+ V = IC.Builder.CreateLShr(C, NumBits);
+ // If we got a constantexpr back, try to simplify it with DataLayout info.
+ if (auto *C = dyn_cast<Constant>(V))
+ if (auto *FoldedC =
+ ConstantFoldConstant(C, DL, &IC.getTargetLibraryInfo()))
+ V = FoldedC;
+ return V;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ IC.Worklist.Add(I);
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can always be evaluated in shifted form.
+ I->setOperand(
+ 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ return I;
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
+ IC.Builder);
+
+ case Instruction::Select:
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
+ return I;
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
+ isLeftShift, IC, DL));
+ return PN;
+ }
+ }
+}
+
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
+ BinaryOperator &I) {
+ bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+ const APInt *Op1C;
+ if (!match(Op1, m_APInt(Op1C)))
+ return nullptr;
+
+ // See if we can propagate this shift into the input; this covers the trivial
+ // case of lshr(shl(x,c1),c2) as well as other more complex cases.
+ if (I.getOpcode() != Instruction::AShr &&
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
+ DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+ " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
+
+ return replaceInstUsesWith(
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
+ }
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ unsigned TypeBits = Op0->getType()->getScalarSizeInBits();
+
+ assert(!Op1C->uge(TypeBits) &&
+ "Shift over the type width should have been removed already");
+
+ if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
+ return FoldedShift;
+
+ // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+ if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
+ Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
+ // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+ // place. Don't try to do this transformation in this case. Also, we
+ // require that the input operand is a shift-by-constant so that we have
+ // confidence that the shifts will get folded together. We could do this
+ // xform in more cases, but it is unlikely to be profitable.
+ if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
+ isa<ConstantInt>(TrOp->getOperand(1))) {
+ // Okay, we'll do this xform. Make the shift of shift.
+ Constant *ShAmt =
+ ConstantExpr::getZExt(cast<Constant>(Op1), TrOp->getType());
+ // (shift2 (shift1 & 0x00FF), c2)
+ Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
+
+ // For logical shifts, the truncation has the effect of making the high
+ // part of the register be zeros. Emulate this by inserting an AND to
+ // clear the top bits as needed. This 'and' will usually be zapped by
+ // other xforms later if dead.
+ unsigned SrcSize = TrOp->getType()->getScalarSizeInBits();
+ unsigned DstSize = TI->getType()->getScalarSizeInBits();
+ APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
+
+ // The mask we constructed says what the trunc would do if occurring
+ // between the shifts. We want to know the effect *after* the second
+ // shift. We know that it is a logical shift by a constant, so adjust the
+ // mask as appropriate.
+ if (I.getOpcode() == Instruction::Shl)
+ MaskV <<= Op1C->getZExtValue();
+ else {
+ assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
+ MaskV.lshrInPlace(Op1C->getZExtValue());
+ }
+
+ // shift1 & 0x00FF
+ Value *And = Builder.CreateAnd(NSh,
+ ConstantInt::get(I.getContext(), MaskV),
+ TI->getName());
+
+ // Return the value truncated to the interesting size.
+ return new TruncInst(And, I.getType());
+ }
+ }
+
+ if (Op0->hasOneUse()) {
+ if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ Value *V1, *V2;
+ ConstantInt *CC;
+ switch (Op0BO->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // These operators commute.
+ // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+ match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
+ Op0BO->getOperand(1)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ Constant *Mask = ConstantInt::get(I.getContext(), Bits);
+ if (VectorType *VT = dyn_cast<VectorType>(X->getType()))
+ Mask = ConstantVector::getSplat(VT->getNumElements(), Mask);
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
+ Value *Op0BOOp1 = Op0BO->getOperand(1);
+ if (isLeftShift && Op0BOOp1->hasOneUse() &&
+ match(Op0BOOp1,
+ m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
+ m_ConstantInt(CC)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+ // X & (CC << C)
+ Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
+ return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+ }
+ LLVM_FALLTHROUGH;
+ }
+
+ case Instruction::Sub: {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
+ Op0BO->getOperand(0)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ Constant *Mask = ConstantInt::get(I.getContext(), Bits);
+ if (VectorType *VT = dyn_cast<VectorType>(X->getType()))
+ Mask = ConstantVector::getSplat(VT->getNumElements(), Mask);
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0),
+ m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))),
+ m_ConstantInt(CC))) && V2 == Op1) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ // X & (CC << C)
+ Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+ }
+
+ break;
+ }
+ }
+
+
+ // If the operand is a bitwise operator with a constant RHS, and the
+ // shift is its only use, we can pull it out of the shift.
+ if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
+ bool isValid = true; // Valid only for And, Or, Xor
+ bool highBitSet = false; // Transform if high bit of constant set?
+
+ switch (Op0BO->getOpcode()) {
+ default: isValid = false; break; // Do not perform transform!
+ case Instruction::Add:
+ isValid = isLeftShift;
+ break;
+ case Instruction::Or:
+ case Instruction::Xor:
+ highBitSet = false;
+ break;
+ case Instruction::And:
+ highBitSet = true;
+ break;
+ }
+
+ // If this is a signed shift right, and the high bit is modified
+ // by the logical operation, do not perform the transformation.
+ // The highBitSet boolean indicates the value of the high bit of
+ // the constant which would cause it to be modified for this
+ // operation.
+ //
+ if (isValid && I.getOpcode() == Instruction::AShr)
+ isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
+
+ if (isValid) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+ NewRHS);
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitShl(BinaryOperator &I) {
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V =
+ SimplifyShlInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *V = commonShiftTransforms(I))
+ return V;
+
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = I.getType()->getScalarSizeInBits();
+ Type *Ty = I.getType();
+
+ // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // This is only valid if X would have zeros shifted out.
+ Value *X;
+ if (match(Op0, m_ZExt(m_Value(X)))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
+ return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
+ }
+
+ // (X >>u C) << C --> X & (-1 << C)
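+ // (e.g. for i8 and C == 3, the mask -1 << C is 0xf8)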
+ if (match(Op0, m_LShr(m_Value(X), m_Specific(Op1)))) {
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ // Be careful about hiding shl instructions behind bit masks. They are used
+ // to represent multiplies by a constant, and it is important that simple
+ // arithmetic expressions are still recognizable by scalar evolution.
+ // The inexact versions are deferred to DAGCombine, so we don't hide shl
+ // behind a bit mask.
+ const APInt *ShOp1;
+ if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) {
+ unsigned ShrAmt = ShOp1->getZExtValue();
+ if (ShrAmt < ShAmt) {
+ // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
+ }
+ if (ShrAmt > ShAmt) {
+ // If C1 > C2: (X >>?,exact C1) << C2 --> X >>?,exact (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ auto *NewShr = BinaryOperator::Create(
+ cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
+ NewShr->setIsExact(true);
+ return NewShr;
+ }
+ }
+
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X << C1) << C2 --> X << (C1 + C2)
+ return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is a NUW shift.
+ if (!I.hasNoUnsignedWrap() &&
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setHasNoUnsignedWrap();
+ return &I;
+ }
+
+ // If the shifted-out value is all sign bits, then this is an NSW shift.
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
+ I.setHasNoSignedWrap();
+ return &I;
+ }
+ }
+
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ Value *X;
+ // (C2 << X) << C1 --> (C2 << C1) << X
+ if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+ return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+ // (X * C2) << C1 --> X * (C2 << C1)
+ if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V =
+ SimplifyLShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Type *Ty = I.getType();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *II = dyn_cast<IntrinsicInst>(Op0);
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ (II->getIntrinsicID() == Intrinsic::ctlz ||
+ II->getIntrinsicID() == Intrinsic::cttz ||
+ II->getIntrinsicID() == Intrinsic::ctpop)) {
+ // ctlz.i32(x)>>5 --> zext(x == 0)
+ // cttz.i32(x)>>5 --> zext(x == 0)
+ // ctpop.i32(x)>>5 --> zext(x == -1)
+ bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
+ Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
+ Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
+ return new ZExtInst(Cmp, Ty);
+ }
+
+ Value *X;
+ const APInt *ShOp1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ if (ShlAmt > ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+ // (X << C1) >>u C2 --> (X << (C1 - C2)) & (-1 >> C2)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ assert(ShlAmt == ShAmt);
+ // (X << C) >>u C --> X & (-1 >>u C)
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ // Are we moving the sign bit to the low bit and widening with high zeros?
+ unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt == BitWidth - 1) {
+ // lshr (sext i1 X to iN), N-1 --> zext X to iN
+ if (SrcTyBitWidth == 1)
+ return new ZExtInst(X, Ty);
+
+ // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
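+ // e.g. lshr (sext i8 X to i32), 31 --> zext (lshr X, 7) to i32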
+ if (Op0->hasOneUse()) {
+ Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
+ return new ZExtInst(NewLShr, Ty);
+ }
+ }
+
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
+ if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
+ // The new shift amount can't be more than the narrow source type.
+ unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
+ Value *AShr = Builder.CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
+ }
+ }
+
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
+ if (Value *V = SimplifyVectorOp(I))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V =
+ SimplifyAShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+
+ // If the shift amount equals the difference in width of the destination
+ // and source scalar types:
+ // ashr (shl (zext X), C), C --> sext X
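+ // e.g. ashr (shl (zext i8 X to i32), 24), 24 --> sext i8 X to i32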
+ Value *X;
+ if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
+ ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
+ return new SExtInst(X, Ty);
+
+ // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ const APInt *ShOp1;
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
+ }
+ if (ShlAmt > ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized arithmetic shifts replicate the sign bit.
+ AmtSum = std::min(AmtSum, BitWidth - 1);
+ // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
+ return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+
+ // See if we can turn a signed shr into an unsigned shr.
+ if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
+ return BinaryOperator::CreateLShr(Op0, Op1);
+
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
new file mode 100644
index 000000000000..5689c0604239
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -0,0 +1,1679 @@
+//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains logic for simplifying instructions based on information
+// about how they are used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Check to see if the specified operand of the specified instruction is a
+/// constant integer. If so, check to see if there are any bits set in the
+/// constant that are not demanded. If so, shrink the constant and return true.
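+/// For example, if only the low 8 bits of the result of 'and X, 0xFFF' are
+/// demanded, the constant can be shrunk to 0xFF.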
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+ const APInt &Demanded) {
+ assert(I && "No instruction?");
+ assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+ // The operand must be a constant integer or splat integer.
+ Value *Op = I->getOperand(OpNo);
+ const APInt *C;
+ if (!match(Op, m_APInt(C)))
+ return false;
+
+ // If there are no bits set that aren't demanded, nothing to do.
+ if (C->isSubsetOf(Demanded))
+ return false;
+
+ // This instruction is producing bits that are not demanded. Shrink the RHS.
+ I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
+
+ return true;
+}
+
+/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
+/// the instruction has any properties that allow us to simplify its operands.
+bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
+ unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
+ KnownBits Known(BitWidth);
+ APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
+ 0, &Inst);
+ if (!V) return false;
+ if (V == &Inst) return true;
+ replaceInstUsesWith(Inst, V);
+ return true;
+}
+
+/// This form of SimplifyDemandedBits simplifies the specified instruction
+/// operand if possible, updating it in place. It returns true if it made any
+/// change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth) {
+ Use &U = I->getOperandUse(OpNo);
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
+ Depth, I);
+ if (!NewVal) return false;
+ U = NewVal;
+ return true;
+}
+
+
+/// This function attempts to replace V with a simpler value based on the
+/// demanded bits. When this function is called, it is known that only the bits
+/// set in DemandedMask of the result of V are ever used downstream.
+/// Consequently, depending on the mask and V, it may be possible to replace V
+/// with a constant or one of its operands. In such cases, this function does
+/// the replacement and returns true. In all other cases, it returns false after
+/// analyzing the expression and setting Known.One to the bits known to be one in the
+/// expression. Known.Zero contains all the bits that are known to be zero in
+/// the expression. These are provided to potentially allow the caller (which
+/// might recursively be SimplifyDemandedBits itself) to simplify the
+/// expression.
+/// Known.One and Known.Zero always follow the invariant that:
+/// Known.One & Known.Zero == 0.
+/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
+/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
+/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
+/// be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification. This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
+Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ KnownBits &Known, unsigned Depth,
+ Instruction *CxtI) {
+ assert(V != nullptr && "Null pointer of Value???");
+ assert(Depth <= 6 && "Limit Search Depth");
+ uint32_t BitWidth = DemandedMask.getBitWidth();
+ Type *VTy = V->getType();
+ assert(
+ (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
+ Known.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask and Known must have same BitWidth");
+
+ if (isa<Constant>(V)) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr;
+ }
+
+ Known.resetAll();
+ if (DemandedMask.isNullValue()) // Not demanding any bits from V.
+ return UndefValue::get(VTy);
+
+ if (Depth == 6) // Limit search depth.
+ return nullptr;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr; // Only analyze instructions.
+ }
+
+ // If there are multiple uses of this value and we aren't at the root, then
+ // we can't do any simplifications of the operands, because DemandedMask
+ // only reflects the bits demanded by *one* of the users.
+ if (Depth != 0 && !I->hasOneUse())
+ return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
+
+ KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
+
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the DemandedMask to all bits so that we can try to simplify the
+ // operands. This allows visitTruncInst (for example) to simplify the
+ // operand of a trunc without duplicating all the logic below.
+ if (Depth == 0 && !V->hasOneUse())
+ DemandedMask.setAllBits();
+
+ switch (I->getOpcode()) {
+ default:
+ computeKnownBits(I, Known, Depth, CxtI);
+ break;
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are known to be clear if zero in either LHS | RHS.
+ APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(VTy, IKnownOne);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
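+    // For example, with (X & 0xFF) and DemandedMask = 0x0F, RHSKnown.One
+    // covers all demanded bits, so X itself can be returned.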
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
+ return I;
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Or: {
+ // If either the LHS or the RHS are One, the result is One.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+    // Output known-1 bits are known to be set if set in either the LHS | RHS.
+ APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(VTy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
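+    // For example, with (X | 0xF0) and DemandedMask = 0x0F, the constant
+    // contributes nothing to the demanded bits, so X itself can be returned.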
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Xor: {
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+ (RHSKnown.One & LHSKnown.One);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+ (RHSKnown.One & LHSKnown.Zero);
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(VTy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits are known to be zero on one side or the
+ // other, turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
+ Instruction *Or =
+ BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ return InsertNewInstWith(Or, *I);
+ }
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
+ RHSKnown.One.isSubsetOf(LHSKnown.One)) {
+ Constant *AndC = Constant::getIntegerValue(VTy,
+ ~RHSKnown.One & DemandedMask);
+ Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ return InsertNewInstWith(And, *I);
+ }
+
+ // If the RHS is a constant, see if we can simplify it.
+ // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ // If our LHS is an 'and' and if it has one use, and if any of the bits we
+ // are flipping are known to be set, then the xor is just resetting those
+ // bits to zero. We can just knock out bits from the 'and' and the 'xor',
+ // simplifying both of them.
+ if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0)))
+ if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ isa<ConstantInt>(LHSInst->getOperand(1)) &&
+ (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
+ ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
+ ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
+ APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
+
+ Constant *AndC =
+ ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
+ Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ InsertNewInstWith(NewAnd, *I);
+
+ Constant *XorC =
+ ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
+ Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
+ return InsertNewInstWith(NewXor, *I);
+ }
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ Known.Zero = std::move(IKnownZero);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Select:
+ // If this is a select as part of a min/max pattern, don't simplify any
+ // further in case we break the structure.
+ Value *LHS, *RHS;
+ if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
+ ShrinkDemandedConstant(I, 2, DemandedMask))
+ return I;
+
+ // Only known if known in both the LHS and RHS.
+ Known.One = RHSKnown.One & LHSKnown.One;
+ Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
+ break;
+ case Instruction::ZExt:
+ case Instruction::Trunc: {
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
+ return I;
+    Known = InputKnown.zextOrTrunc(BitWidth);
+ // Any top bits are known to be zero.
+ if (BitWidth > SrcBitWidth)
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ case Instruction::BitCast:
+ if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
+ return nullptr; // vector->int or fp->int?
+
+ if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
+ if (VectorType *SrcVTy =
+ dyn_cast<VectorType>(I->getOperand(0)->getType())) {
+ if (DstVTy->getNumElements() != SrcVTy->getNumElements())
+ // Don't touch a bitcast between vectors of different element counts.
+ return nullptr;
+ } else
+ // Don't touch a scalar-to-vector bitcast.
+ return nullptr;
+ } else if (I->getOperand(0)->getType()->isVectorTy())
+ // Don't touch a vector-to-scalar bitcast.
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ case Instruction::SExt: {
+ // Compute the bits in the result that are not present in the input.
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
+
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ if (DemandedMask.getActiveBits() > SrcBitWidth)
+ InputDemandedBits.setBit(SrcBitWidth-1);
+
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
+ return I;
+
+    // If the input sign bit is known zero, or if none of the extended bits
+    // are demanded, convert this into a zero extension.
+ if (InputKnown.isNonNegative() ||
+ DemandedMask.getActiveBits() <= SrcBitWidth) {
+ // Convert to ZExt cast.
+ CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+ return InsertNewInstWith(NewCast, *I);
+ }
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ Known = InputKnown.sext(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub: {
+ /// If the high-bits of an ADD/SUB are not demanded, then we do not care
+ /// about the high bits of the operands.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ if (NLZ > 0) {
+ // Right fill the mask of bits for this ADD/SUB to demand the most
+ // significant bit and all those below it.
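+      // For example, for a 16-bit add with DemandedMask = 0x00F0 (NLZ = 8),
+      // DemandedFromOps becomes 0x00FF: carries only propagate upward, so
+      // bits above the highest demanded bit never matter.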
+ APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+ if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
+ ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
+ // Disable the nsw and nuw flags here: We can no longer guarantee that
+ // we won't wrap after simplification. Removing the nsw/nuw flags is
+ // legal here because the top bit is not demanded.
+ BinaryOperator &BinOP = *cast<BinaryOperator>(I);
+ BinOP.setHasNoSignedWrap(false);
+ BinOP.setHasNoUnsignedWrap(false);
+ return I;
+ }
+
+ // If we are known to be adding/subtracting zeros to every bit below
+ // the highest demanded bit, we just return the other side.
+ if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ // We can't do this with the LHS for subtraction.
+ if (I->getOpcode() == Instruction::Add &&
+ DemandedFromOps.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+ }
+
+ // Otherwise just hand the add/sub off to computeKnownBits to fill in
+ // the known zeros and ones.
+ computeKnownBits(V, Known, Depth, CxtI);
+ break;
+ }
+ case Instruction::Shl: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ const APInt *ShrAmt;
+ if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt)))) {
+ Instruction *Shr = cast<Instruction>(I->getOperand(0));
+ if (Value *R = simplifyShrShlDemandedBits(
+ Shr, *ShrAmt, I, *SA, DemandedMask, Known))
+ return R;
+ }
+
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+
+ // If the shift is NUW/NSW, then it does demand the high bits.
+ ShlOperator *IOp = cast<ShlOperator>(I);
+ if (IOp->hasNoSignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt+1);
+ else if (IOp->hasNoUnsignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShiftAmt;
+ Known.One <<= ShiftAmt;
+ // low bits known zero.
+ if (ShiftAmt)
+ Known.Zero.setLowBits(ShiftAmt);
+ }
+ break;
+ }
+ case Instruction::LShr: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Unsigned shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<LShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+ if (ShiftAmt)
+ Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
+ }
+ break;
+ }
+ case Instruction::AShr: {
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
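+    // e.g. for (ashr X, C) with DemandedMask == 1, bit 0 of the result is
+    // simply bit C of X for any in-range C, which is exactly what lshr
+    // produces.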
+ if (DemandedMask.isOneValue()) {
+ // Perform the logical shift right.
+ Instruction *NewVal = BinaryOperator::CreateLShr(
+ I->getOperand(0), I->getOperand(1), I->getName());
+ return InsertNewInstWith(NewVal, *I);
+ }
+
+ // If the sign bit is the only bit demanded by this ashr, then there is no
+ // need to do it, the shift doesn't change the high bit.
+ if (DemandedMask.isSignMask())
+ return I->getOperand(0);
+
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Signed shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+ // If any of the high bits are demanded, we should set the sign bit as
+ // demanded.
+ if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+ DemandedMaskIn.setSignBit();
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<AShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ // Compute the new bits that are at the top now.
+ APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+
+ // Handle the sign bits.
+ APInt SignMask(APInt::getSignMask(BitWidth));
+ // Adjust to where it is now in the mask.
+ SignMask.lshrInPlace(ShiftAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (BitWidth <= ShiftAmt || Known.Zero[BitWidth-ShiftAmt-1] ||
+ !DemandedMask.intersects(HighBits)) {
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
+ I->getOperand(1));
+ LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
+ return InsertNewInstWith(LShr, *I);
+ } else if (Known.One.intersects(SignMask)) { // New bits are known one.
+ Known.One |= HighBits;
+ }
+ }
+ break;
+ }
+ case Instruction::SRem:
+ if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // X % -1 demands all the bits because we don't want to introduce
+ // INT_MIN % -1 (== undef) by accident.
+ if (Rem->isMinusOne())
+ break;
+ APInt RA = Rem->getValue().abs();
+ if (RA.isPowerOf2()) {
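+        // e.g. for X srem 8, the low three bits of the result always equal
+        // the low three bits of X, so if only those bits are demanded we can
+        // return X directly.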
+ if (DemandedMask.ult(RA)) // srem won't affect demanded bits
+ return I->getOperand(0);
+
+ APInt LowBits = RA - 1;
+ APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
+ return I;
+
+ // The low bits of LHS are unchanged by the srem.
+ Known.Zero = LHSKnown.Zero & LowBits;
+ Known.One = LHSKnown.One & LowBits;
+
+ // If LHS is non-negative or has all low bits zero, then the upper bits
+ // are all zero.
+ if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
+ Known.Zero |= ~LowBits;
+
+ // If LHS is negative and not all low bits are zero, then the upper bits
+ // are all one.
+ if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
+ Known.One |= ~LowBits;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // The sign bit is the LHS's sign bit, except when the result of the
+ // remainder is zero.
+ if (DemandedMask.isSignBitSet()) {
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
+ // If it's known zero, our sign bit is also zero.
+ if (LHSKnown.isNonNegative())
+ Known.makeNonNegative();
+ }
+ break;
+ case Instruction::URem: {
+ KnownBits Known2(BitWidth);
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
+ return I;
+
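+    // The remainder is always strictly less than the divisor (operand 1), so
+    // the result has at least as many leading zero bits as the divisor is
+    // known to have. Known2 holds the divisor's known bits from the second
+    // call above.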
+ unsigned Leaders = Known2.countMinLeadingZeros();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+ break;
+ }
+ case Instruction::Call:
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::bswap: {
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ unsigned NTZ = DemandedMask.countTrailingZeros();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ &= ~7;
+ NTZ &= ~7;
+ // If we need exactly one byte, we can do this transformation.
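+        // For example, for an i32 bswap where only bits 7:0 of the result
+        // are demanded (NLZ = 24, NTZ = 0), the needed byte is input bits
+        // 31:24, so we emit a single lshr by 24 instead of the bswap.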
+ if (BitWidth-NLZ-NTZ == 8) {
+ unsigned ResultBit = NTZ;
+ unsigned InputBit = BitWidth-NTZ-8;
+
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
+ Instruction *NewVal;
+ if (InputBit > ResultBit)
+ NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), InputBit-ResultBit));
+ else
+ NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal->takeName(I);
+ return InsertNewInstWith(NewVal, *I);
+ }
+
+ // TODO: Could compute known zero/one bits based on the input.
+ break;
+ }
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // MOVMSK copies the vector elements' sign bits to the low bits
+ // and zeros the high bits.
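+        // e.g. for movmsk.ps on a <4 x float> argument, bits 3:0 of the i32
+        // result hold the four sign bits and bits 31:4 are always zero.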
+ unsigned ArgWidth;
+ if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+ ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+ } else {
+ auto Arg = II->getArgOperand(0);
+ auto ArgType = cast<VectorType>(Arg->getType());
+ ArgWidth = ArgType->getNumElements();
+ }
+
+        // If we don't need any of the low bits then return zero;
+ // we know that DemandedMask is non-zero already.
+ APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+ if (DemandedElts.isNullValue())
+ return ConstantInt::getNullValue(VTy);
+
+ // We know that the upper bits are set to zero.
+ Known.Zero.setBitsFrom(ArgWidth);
+ return nullptr;
+ }
+ case Intrinsic::x86_sse42_crc32_64_64:
+ Known.Zero.setBitsFrom(32);
+ return nullptr;
+ }
+ }
+ computeKnownBits(V, Known, Depth, CxtI);
+ break;
+ }
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It computes Known
+/// bits. It also tries to handle simplifications that can be done based on
+/// DemandedMask, but without modifying the Instruction.
+Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth,
+ Instruction *CxtI) {
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ Type *ITy = I->getType();
+
+ KnownBits LHSKnown(BitWidth);
+ KnownBits RHSKnown(BitWidth);
+
+ // Despite the fact that we can't simplify this instruction in all User's
+ // context, we can at least compute the known bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+    // Output known-0 bits are known to be clear if zero in either LHS | RHS.
+ APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Or: {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Xor: {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+ (RHSKnown.One & LHSKnown.One);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+ (RHSKnown.One & LHSKnown.Zero);
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ Known.Zero = std::move(IKnownZero);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ default:
+ // Compute the Known bits to simplify things downstream.
+ computeKnownBits(I, Known, Depth, CxtI);
+
+ // If this user is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ break;
+ }
+
+ return nullptr;
+}
+
+
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where C1 and C2 are constants, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
+/// ..., bn}, without considering the specific value X holds.
+/// This transformation is legal iff one of the following conditions holds:
+///  1) All the bits in S are 0; in this case E1 == E2.
+///  2) We don't care about those bits in S, per the input DemandedMask.
+///  3) A combination of 1) and 2): some bits in S are 0, and we don't care
+///     about the rest.
+///
+/// Currently we only test condition 2).
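+///
+/// e.g. for an i8 value X, E1 = (X lshr 1) << 3 and E2 = X << 2 differ only
+/// in bit 2, so with DemandedMask = 0xF8 the pair can be rewritten as
+/// X << 2.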
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
+Value *
+InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
+ Instruction *Shl, const APInt &ShlOp1,
+ const APInt &DemandedMask,
+ KnownBits &Known) {
+ if (!ShlOp1 || !ShrOp1)
+ return nullptr; // No-op.
+
+ Value *VarX = Shr->getOperand(0);
+ Type *Ty = VarX->getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+ return nullptr; // Undef.
+
+ unsigned ShlAmt = ShlOp1.getZExtValue();
+ unsigned ShrAmt = ShrOp1.getZExtValue();
+
+ Known.One.clearAllBits();
+ Known.Zero.setLowBits(ShlAmt - 1);
+ Known.Zero &= DemandedMask;
+
+ APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+ APInt BitMask2(APInt::getAllOnesValue(BitWidth));
+
+ bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+ BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+ (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+ if (ShrAmt <= ShlAmt) {
+ BitMask2 <<= (ShlAmt - ShrAmt);
+ } else {
+ BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+ BitMask2.ashr(ShrAmt - ShlAmt);
+ }
+
+  // Check if condition-2 (see the comment to this function) is satisfied.
+ if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+ if (ShrAmt == ShlAmt)
+ return VarX;
+
+ if (!Shr->hasOneUse())
+ return nullptr;
+
+ BinaryOperator *New;
+ if (ShrAmt < ShlAmt) {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+ New = BinaryOperator::CreateShl(VarX, Amt);
+ BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+ New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+ } else {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+ New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+ BinaryOperator::CreateAShr(VarX, Amt);
+ if (cast<BinaryOperator>(Shr)->isExact())
+ New->setIsExact(true);
+ }
+
+ return InsertNewInstWith(New, *Shl);
+ }
+
+ return nullptr;
+}
+
+/// The specified value produces a vector with any number of elements.
+/// DemandedElts contains the set of elements that are actually used by the
+/// caller. This method analyzes which elements of the operand are undef and
+/// returns that information in UndefElts.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified and the resulting value is
+/// returned. This returns null if no change was made.
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt &UndefElts,
+ unsigned Depth) {
+ unsigned VWidth = V->getType()->getVectorNumElements();
+ APInt EltMask(APInt::getAllOnesValue(VWidth));
+ assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+ if (isa<UndefValue>(V)) {
+ // If the entire vector is undefined, just return this info.
+ UndefElts = EltMask;
+ return nullptr;
+ }
+
+ if (DemandedElts.isNullValue()) { // If nothing is demanded, provide undef.
+ UndefElts = EltMask;
+ return UndefValue::get(V->getType());
+ }
+
+ UndefElts = 0;
+
+ // Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential.
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ // Check if this is identity. If so, return 0 since we are not simplifying
+ // anything.
+ if (DemandedElts.isAllOnesValue())
+ return nullptr;
+
+ Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ Constant *Undef = UndefValue::get(EltTy);
+
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ if (!DemandedElts[i]) { // If not demanded, set to undef.
+ Elts.push_back(Undef);
+ UndefElts.setBit(i);
+ continue;
+ }
+
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
+ if (isa<UndefValue>(Elt)) { // Already undef.
+ Elts.push_back(Undef);
+ UndefElts.setBit(i);
+ } else { // Otherwise, defined.
+ Elts.push_back(Elt);
+ }
+ }
+
+ // If we changed the constant, return it.
+ Constant *NewCV = ConstantVector::get(Elts);
+ return NewCV != C ? NewCV : nullptr;
+ }
+
+ // Limit search depth.
+ if (Depth == 10)
+ return nullptr;
+
+ // If multiple users are using the root value, proceed with
+ // simplification conservatively assuming that all elements
+ // are needed.
+ if (!V->hasOneUse()) {
+ // Quit if we find multiple users of a non-root value though.
+ // They'll be handled when it's their turn to be visited by
+ // the main instcombine process.
+ if (Depth != 0)
+ // TODO: Just compute the UndefElts information recursively.
+ return nullptr;
+
+ // Conservatively assume that all elements are needed.
+ DemandedElts = EltMask;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return nullptr; // Only analyze instructions.
+
+ bool MadeChange = false;
+ APInt UndefElts2(VWidth, 0);
+ APInt UndefElts3(VWidth, 0);
+ Value *TmpV;
+ switch (I->getOpcode()) {
+ default: break;
+
+ case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites,
+    // so demand exactly the same input as we produce.
+ ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!Idx) {
+ // Note that we can't propagate undef elt info, because we don't know
+ // which elt is getting updated.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ break;
+ }
+
+ // If this is inserting an element that isn't demanded, remove this
+ // insertelement.
+ unsigned IdxNo = Idx->getZExtValue();
+ if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
+ Worklist.Add(I);
+ return I->getOperand(0);
+ }
+
+ // Otherwise, the element inserted overwrites whatever was there, so the
+ // input demanded set is simpler than the output set.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(IdxNo);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
+ UndefElts, Depth + 1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+ // The inserted element is defined.
+ UndefElts.clearBit(IdxNo);
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
+ unsigned LHSVWidth =
+ Shuffle->getOperand(0)->getType()->getVectorNumElements();
+ APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
+ for (unsigned i = 0; i < VWidth; i++) {
+ if (DemandedElts[i]) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal != -1u) {
+ assert(MaskVal < LHSVWidth * 2 &&
+ "shufflevector mask index out of range!");
+ if (MaskVal < LHSVWidth)
+ LeftDemanded.setBit(MaskVal);
+ else
+ RightDemanded.setBit(MaskVal - LHSVWidth);
+ }
+ }
+ }
+
+ APInt LHSUndefElts(LHSVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
+ LHSUndefElts, Depth + 1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+ APInt RHSUndefElts(LHSVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
+ RHSUndefElts, Depth + 1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ bool NewUndefElts = false;
+ unsigned LHSIdx = -1u, LHSValIdx = -1u;
+ unsigned RHSIdx = -1u, RHSValIdx = -1u;
+ bool LHSUniform = true;
+ bool RHSUniform = true;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u) {
+ UndefElts.setBit(i);
+ } else if (!DemandedElts[i]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else if (MaskVal < LHSVWidth) {
+ if (LHSUndefElts[MaskVal]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ LHSIdx = LHSIdx == -1u ? i : LHSVWidth;
+ LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth;
+ LHSUniform = LHSUniform && (MaskVal == i);
+ }
+ } else {
+ if (RHSUndefElts[MaskVal - LHSVWidth]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ RHSIdx = RHSIdx == -1u ? i : LHSVWidth;
+ RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth;
+ RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i);
+ }
+ }
+ }
+
+ // Try to transform shuffle with constant vector and single element from
+ // this constant vector to single insertelement instruction.
+ // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
+ // insertelement V, C[ci], ci-n
+ if (LHSVWidth == Shuffle->getType()->getNumElements()) {
+ Value *Op = nullptr;
+ Constant *Value = nullptr;
+ unsigned Idx = -1u;
+
+ // Find constant vector with the single element in shuffle (LHS or RHS).
+ if (LHSIdx < LHSVWidth && RHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
+ Op = Shuffle->getOperand(1);
+ Value = CV->getOperand(LHSValIdx);
+ Idx = LHSIdx;
+ }
+ }
+ if (RHSIdx < LHSVWidth && LHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
+ Op = Shuffle->getOperand(0);
+ Value = CV->getOperand(RHSValIdx);
+ Idx = RHSIdx;
+ }
+ }
+ // Found constant vector with single element - convert to insertelement.
+ if (Op && Value) {
+ Instruction *New = InsertElementInst::Create(
+ Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
+ Shuffle->getName());
+ InsertNewInstWith(New, *Shuffle);
+ return New;
+ }
+ }
+ if (NewUndefElts) {
+ // Add additional discovered undefs.
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0; i < VWidth; ++i) {
+ if (UndefElts[i])
+ Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext())));
+ else
+ Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()),
+ Shuffle->getMaskValue(i)));
+ }
+ I->setOperand(2, ConstantVector::get(Elts));
+ MadeChange = true;
+ }
+ break;
+ }
+ case Instruction::Select: {
+ APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts);
+ if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) {
+ for (unsigned i = 0; i < VWidth; i++) {
+ Constant *CElt = CV->getAggregateElement(i);
+ // Method isNullValue always returns false when called on a
+ // ConstantExpr. If CElt is a ConstantExpr then skip it in order to
+        // avoid propagating incorrect information.
+ if (isa<ConstantExpr>(CElt))
+ continue;
+ if (CElt->isNullValue())
+ LeftDemanded.clearBit(i);
+ else
+ RightDemanded.clearBit(i);
+ }
+ }
+
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, UndefElts,
+ Depth + 1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded,
+ UndefElts2, Depth + 1);
+ if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; }
+
+ // Output elements are undefined if both are undefined.
+ UndefElts &= UndefElts2;
+ break;
+ }
+ case Instruction::BitCast: {
+ // Vector->vector casts only.
+ VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+ if (!VTy) break;
+ unsigned InVWidth = VTy->getNumElements();
+ APInt InputDemandedElts(InVWidth, 0);
+ UndefElts2 = APInt(InVWidth, 0);
+ unsigned Ratio;
+
+ if (VWidth == InVWidth) {
+ // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+ // elements as are demanded of us.
+ Ratio = 1;
+ InputDemandedElts = DemandedElts;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an input element is live if any of the
+ // corresponding output elements are live.
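+      // e.g. for a bitcast from <2 x i64> to <4 x i32>, output elements 2*i
+      // and 2*i+1 both come from input element i, so demanding either of
+      // them demands input element i.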
+ Ratio = VWidth / InVWidth;
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (DemandedElts[OutIdx])
+ InputDemandedElts.setBit(OutIdx / Ratio);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an input element is live if the
+ // corresponding output element is live.
+ Ratio = InVWidth / VWidth;
+ for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+ if (DemandedElts[InIdx / Ratio])
+ InputDemandedElts.setBit(InIdx);
+ } else {
+ // Unsupported so far.
+ break;
+ }
+
+    // Simplify the operand based on the demanded input elements computed
+    // above.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) {
+ I->setOperand(0, TmpV);
+ MadeChange = true;
+ }
+
+ if (VWidth == InVWidth) {
+ UndefElts = UndefElts2;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an output element is undef if the
+ // corresponding input element is undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (UndefElts2[OutIdx / Ratio])
+ UndefElts.setBit(OutIdx);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an output element is undef if all of the
+ // corresponding input elements are undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+ APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
+ if (SubUndef.countPopulation() == Ratio)
+ UndefElts.setBit(OutIdx);
+ }
+ } else {
+ llvm_unreachable("Unimp");
+ }
+ break;
+ }
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+    // Note that div/rem are deliberately not handled here: they demand all of
+    // their input elements, because simplifying an undemanded lane could
+    // introduce a divide by zero.
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+ // Output elements are undefined if both are undefined. Consider things
+ // like undef&0. The result is known zero, not undef.
+ UndefElts &= UndefElts2;
+ break;
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+ break;
+
+ case Instruction::Call: {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II) break;
+ switch (II->getIntrinsicID()) {
+ default: break;
+
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+      // The instructions for these intrinsics are specified to zero the upper
+      // bits rather than pass them through like other scalar intrinsics. So we
+      // shouldn't just use Arg0 if DemandedElts[0] is clear, as we do for
+      // other intrinsics. Instead we should return a zero vector.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return ConstantAggregateZero::get(II->getType());
+ }
+
+ // Only the lower element is used.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // Only the lower element is undefined. The high elements are zero.
+ UndefElts = UndefElts[0];
+ break;
+
+ // Unary scalar-as-vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ case Intrinsic::x86_sse_sqrt_ss:
+ case Intrinsic::x86_sse2_sqrt_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
+ }
+ // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
+ // checks).
+ break;
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0. The low element is a function of both
+ // operands.
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ // Lower element is undefined if both lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0])
+ UndefElts.clearBit(0);
+
+ break;
+ }
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element comes from operand 1.
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // Don't use the low element of operand 0.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(0);
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts2,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ // Take the high undef elements from operand 0 and take the lower element
+ // from operand 1.
+ UndefElts.clearBit(0);
+ UndefElts |= UndefElts2[0];
+ break;
+ }
+
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ case Intrinsic::x86_fma_vfmadd_ss:
+ case Intrinsic::x86_fma_vfmsub_ss:
+ case Intrinsic::x86_fma_vfnmadd_ss:
+ case Intrinsic::x86_fma_vfnmsub_ss:
+ case Intrinsic::x86_fma_vfmadd_sd:
+ case Intrinsic::x86_fma_vfmsub_sd:
+ case Intrinsic::x86_fma_vfnmadd_sd:
+ case Intrinsic::x86_fma_vfnmsub_sd:
+ case Intrinsic::x86_avx512_mask_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask_vfmadd_sd:
+ case Intrinsic::x86_avx512_maskz_vfmadd_ss:
+ case Intrinsic::x86_avx512_maskz_vfmadd_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1 and 2.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
+ UndefElts3, Depth + 1);
+ if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+
+ break;
+
+ case Intrinsic::x86_avx512_mask3_vfmadd_ss:
+ case Intrinsic::x86_avx512_mask3_vfmadd_sd:
+ case Intrinsic::x86_avx512_mask3_vfmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfmsub_sd:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
+ case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
+ // These intrinsics get the passthru bits from operand 2.
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg2.
+ if (!DemandedElts[0]) {
+ Worklist.Add(II);
+ return II->getArgOperand(2);
+ }
+
+ // Only lower element is used for operand 0 and 1.
+ DemandedElts = 1;
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts2, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+ UndefElts3, Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+
+ break;
+
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ case Intrinsic::x86_avx512_pmul_dq_512:
+ case Intrinsic::x86_avx512_pmulu_dq_512: {
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+ unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
+ assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
+
+ APInt InnerDemandedElts(InnerVWidth, 0);
+ for (unsigned i = 0; i != VWidth; ++i)
+ if (DemandedElts[i])
+ InnerDemandedElts.setBit(i * 2);
+
+ UndefElts2 = APInt(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ UndefElts3 = APInt(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+ break;
+ }
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II->getArgOperand(0)->getType();
+ unsigned InnerVWidth = Ty0->getVectorNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ auto *Op = II->getArgOperand(OpNum);
+ APInt OpUndefElts(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
+ Depth + 1);
+ if (TmpV) {
+ II->setArgOperand(OpNum, TmpV);
+ MadeChange = true;
+ }
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ Value *Op1 = II->getArgOperand(1);
+ TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ break;
+ }
+
+ // SSE4A instructions leave the upper 64-bits of the 128-bit result
+ // in an undefined state.
+ case Intrinsic::x86_sse4a_extrq:
+ case Intrinsic::x86_sse4a_extrqi:
+ case Intrinsic::x86_sse4a_insertq:
+ case Intrinsic::x86_sse4a_insertqi:
+ UndefElts.setHighBits(VWidth / 2);
+ break;
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format:
+ case Intrinsic::amdgcn_image_sample:
+ case Intrinsic::amdgcn_image_sample_cl:
+ case Intrinsic::amdgcn_image_sample_d:
+ case Intrinsic::amdgcn_image_sample_d_cl:
+ case Intrinsic::amdgcn_image_sample_l:
+ case Intrinsic::amdgcn_image_sample_b:
+ case Intrinsic::amdgcn_image_sample_b_cl:
+ case Intrinsic::amdgcn_image_sample_lz:
+ case Intrinsic::amdgcn_image_sample_cd:
+ case Intrinsic::amdgcn_image_sample_cd_cl:
+
+ case Intrinsic::amdgcn_image_sample_c:
+ case Intrinsic::amdgcn_image_sample_c_cl:
+ case Intrinsic::amdgcn_image_sample_c_d:
+ case Intrinsic::amdgcn_image_sample_c_d_cl:
+ case Intrinsic::amdgcn_image_sample_c_l:
+ case Intrinsic::amdgcn_image_sample_c_b:
+ case Intrinsic::amdgcn_image_sample_c_b_cl:
+ case Intrinsic::amdgcn_image_sample_c_lz:
+ case Intrinsic::amdgcn_image_sample_c_cd:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl:
+
+ case Intrinsic::amdgcn_image_sample_o:
+ case Intrinsic::amdgcn_image_sample_cl_o:
+ case Intrinsic::amdgcn_image_sample_d_o:
+ case Intrinsic::amdgcn_image_sample_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_l_o:
+ case Intrinsic::amdgcn_image_sample_b_o:
+ case Intrinsic::amdgcn_image_sample_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_lz_o:
+ case Intrinsic::amdgcn_image_sample_cd_o:
+ case Intrinsic::amdgcn_image_sample_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_sample_c_o:
+ case Intrinsic::amdgcn_image_sample_c_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_d_o:
+ case Intrinsic::amdgcn_image_sample_c_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_l_o:
+ case Intrinsic::amdgcn_image_sample_c_b_o:
+ case Intrinsic::amdgcn_image_sample_c_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_lz_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_getlod: {
+ if (VWidth == 1 || !DemandedElts.isMask())
+ return nullptr;
+
+ // TODO: Handle 3 vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
+ if (NewNumElts == VWidth)
+ return nullptr;
+
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = V->getType()->getVectorElementType();
+
+ Type *NewTy = (NewNumElts == 1) ? EltTy :
+ VectorType::get(EltTy, NewNumElts);
+
+ auto IID = II->getIntrinsicID();
+
+ bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
+ IID == Intrinsic::amdgcn_buffer_load_format;
+
+ Function *NewIntrin = IsBuffer ?
+ Intrinsic::getDeclaration(M, IID, NewTy) :
+ // Samplers have 3 mangled types.
+ Intrinsic::getDeclaration(M, IID,
+ { NewTy, II->getArgOperand(0)->getType(),
+ II->getArgOperand(1)->getType()});
+
+ SmallVector<Value *, 5> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+
+ if (!IsBuffer) {
+ ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
+ if (DMask) {
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ unsigned PopCnt = 0;
+ unsigned NewDMask = 0;
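+          // Keep only the first NewNumElts set bits of the dmask; e.g. a
+          // dmask of 0b1011 trimmed to two elements becomes 0b0011.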
+ for (unsigned I = 0; I < 4; ++I) {
+ const unsigned Bit = 1 << I;
+ if (!!(DMaskVal & Bit)) {
+ if (++PopCnt > NewNumElts)
+ break;
+
+ NewDMask |= Bit;
+ }
+ }
+
+ NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
+ }
+ }
+
+
+ if (NewNumElts == 1) {
+ return Builder.CreateInsertElement(UndefValue::get(V->getType()),
+ NewCall, static_cast<uint64_t>(0));
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ for (unsigned I = 0; I < VWidth; ++I)
+ EltMask.push_back(I);
+
+ Value *Shuffle = Builder.CreateShuffleVector(
+ NewCall, UndefValue::get(NewTy), EltMask);
+
+ MadeChange = true;
+ return Shuffle;
+ }
+ }
+ break;
+ }
+ }
+ return MadeChange ? I : nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
new file mode 100644
index 000000000000..dd71a31b644b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -0,0 +1,1492 @@
+//===- InstCombineVectorOps.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements instcombine for ExtractElement, InsertElement and
+// ShuffleVector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Return true if the value is cheaper to scalarize than it is to leave as a
+/// vector operation. isConstant indicates whether we're extracting one known
+/// element. If false we're extracting a variable index.
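+///
+/// For example (illustrative IR), the extract below is cheap to scalarize
+/// because the add has a single use and a constant operand:
+///   %v = add <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
+///   %e = extractelement <4 x i32> %v, i32 1
+/// so visitExtractElementInst can turn it into, roughly:
+///   %x1 = extractelement <4 x i32> %x, i32 1
+///   %e1 = add i32 %x1, 2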
+static bool cheapToScalarize(Value *V, bool isConstant) {
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isConstant) return true;
+
+ // If all elts are the same, we can extract it and use any of the values.
+ if (Constant *Op0 = C->getAggregateElement(0U)) {
+ for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e;
+ ++i)
+ if (C->getAggregateElement(i) != Op0)
+ return false;
+ return true;
+ }
+ }
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+  // An insertelement gets simplified to the inserted element or is deleted if
+  // this is a constant-index extractelement and it's a constant-index insertelt.
+ if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+ isa<ConstantInt>(I->getOperand(2)))
+ return true;
+ if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+ return true;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+ if (BO->hasOneUse() &&
+ (cheapToScalarize(BO->getOperand(0), isConstant) ||
+ cheapToScalarize(BO->getOperand(1), isConstant)))
+ return true;
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (CI->hasOneUse() &&
+ (cheapToScalarize(CI->getOperand(0), isConstant) ||
+ cheapToScalarize(CI->getOperand(1), isConstant)))
+ return true;
+
+ return false;
+}
+
+// If we have a PHI node with a vector type that is only used to feed
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
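+//
+// For example (illustrative IR), with a vector PHI that only feeds itself
+// through a single fadd and constant-index extracts:
+//   %vphi  = phi <2 x float> [ %init, %entry ], [ %vnext, %loop ]
+//   %vnext = fadd <2 x float> %vphi, <float 1.0, float 1.0>
+//   %s     = extractelement <2 x float> %vphi, i32 0
+// the PHI is rewritten (roughly, once the constant extract folds) as:
+//   %sphi  = phi float [ %init.elt, %entry ], [ %snext, %loop ]
+//   %snext = fadd float %sphi, 1.0
+// where %init.elt extracts element 0 of %init in its defining block and all
+// uses of %s are replaced by %sphi.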
+Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
+ SmallVector<Instruction *, 2> Extracts;
+ // The users we want the PHI to have are:
+ // 1) The EI ExtractElement (we already know this)
+ // 2) Possibly more ExtractElements with the same index.
+ // 3) Another operand, which will feed back into the PHI.
+ Instruction *PHIUser = nullptr;
+ for (auto U : PN->users()) {
+ if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
+ if (EI.getIndexOperand() == EU->getIndexOperand())
+ Extracts.push_back(EU);
+ else
+ return nullptr;
+ } else if (!PHIUser) {
+ PHIUser = cast<Instruction>(U);
+ } else {
+ return nullptr;
+ }
+ }
+
+ if (!PHIUser)
+ return nullptr;
+
+ // Verify that this PHI user has one use, which is the PHI itself,
+ // and that it is a binary operation which is cheap to scalarize.
+  // Otherwise, return nullptr.
+ if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
+ !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
+ return nullptr;
+
+ // Create a scalar PHI node that will replace the vector PHI node
+ // just before the current PHI node.
+ PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
+ PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
+ // Scalarize each PHI operand.
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ Value *PHIInVal = PN->getIncomingValue(i);
+ BasicBlock *inBB = PN->getIncomingBlock(i);
+ Value *Elt = EI.getIndexOperand();
+ // If the operand is the PHI induction variable:
+ if (PHIInVal == PHIUser) {
+ // Scalarize the binary operation. Its first operand is the
+ // scalar PHI, and the second operand is extracted from the other
+ // vector operand.
+ BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
+ unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
+ Value *Op = InsertNewInstWith(
+ ExtractElementInst::Create(B0->getOperand(opId), Elt,
+ B0->getOperand(opId)->getName() + ".Elt"),
+ *B0);
+ Value *newPHIUser = InsertNewInstWith(
+ BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
+ scalarPHI, Op, B0), *B0);
+ scalarPHI->addIncoming(newPHIUser, inBB);
+ } else {
+ // Scalarize PHI input:
+ Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
+ // Insert the new instruction into the predecessor basic block.
+ Instruction *pos = dyn_cast<Instruction>(PHIInVal);
+ BasicBlock::iterator InsertPos;
+ if (pos && !isa<PHINode>(pos)) {
+ InsertPos = ++pos->getIterator();
+ } else {
+ InsertPos = inBB->getFirstInsertionPt();
+ }
+
+ InsertNewInstWith(newEI, *InsertPos);
+
+ scalarPHI->addIncoming(newEI, inBB);
+ }
+ }
+
+ for (auto E : Extracts)
+ replaceInstUsesWith(*E, scalarPHI);
+
+ return &EI;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+ if (Value *V = SimplifyExtractElementInst(EI.getVectorOperand(),
+ EI.getIndexOperand(),
+ SQ.getWithInstruction(&EI)))
+ return replaceInstUsesWith(EI, V);
+
+  // If the vector value is a constant with all elements the same, replace EI
+  // with that element. We handle a known element number below.
+ if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
+ if (cheapToScalarize(C, false))
+ return replaceInstUsesWith(EI, C->getAggregateElement(0U));
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+ unsigned IndexVal = IdxC->getZExtValue();
+ unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
+
+ // InstSimplify handles cases where the index is invalid.
+ assert(IndexVal < VectorWidth);
+
+ // This instruction only demands the single element from the input vector.
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
+ APInt UndefElts(VectorWidth, 0);
+ APInt DemandedMask(VectorWidth, 0);
+ DemandedMask.setBit(IndexVal);
+ if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask,
+ UndefElts)) {
+ EI.setOperand(0, V);
+ return &EI;
+ }
+ }
+
+ // If this extractelement is directly using a bitcast from a vector of
+ // the same number of elements, see if we can find the source element from
+ // it. In this case, we will end up needing to bitcast the scalars.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
+ if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
+ if (VT->getNumElements() == VectorWidth)
+ if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal))
+ return new BitCastInst(Elt, EI.getType());
+ }
+
+ // If there's a vector PHI feeding a scalar use through this extractelement
+ // instruction, try to scalarize the PHI.
+ if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) {
+ Instruction *scalarPHI = scalarizePHI(EI, PN);
+ if (scalarPHI)
+ return scalarPHI;
+ }
+ }
+
+ if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
+ // Push extractelement into predecessor operation if legal and
+ // profitable to do so.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ if (I->hasOneUse() &&
+ cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
+ Value *newEI0 =
+ Builder.CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
+ EI.getName()+".lhs");
+ Value *newEI1 =
+ Builder.CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
+ EI.getName()+".rhs");
+ return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(),
+ newEI0, newEI1, BO);
+ }
+ } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+ // Extracting the inserted element?
+ if (IE->getOperand(2) == EI.getOperand(1))
+ return replaceInstUsesWith(EI, IE->getOperand(1));
+      // If the inserted and extracted element indices are constants, they
+      // must differ (the equal case was handled above), so extract from the
+      // pre-inserted value instead.
+ if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) {
+ Worklist.AddValue(EI.getOperand(0));
+ EI.setOperand(0, IE->getOperand(0));
+ return &EI;
+ }
+ } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ // If this is extracting an element from a shufflevector, figure out where
+ // it came from and extract from the appropriate input element instead.
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+ int SrcIdx = SVI->getMaskValue(Elt->getZExtValue());
+ Value *Src;
+ unsigned LHSWidth =
+ SVI->getOperand(0)->getType()->getVectorNumElements();
+
+ if (SrcIdx < 0)
+ return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ if (SrcIdx < (int)LHSWidth)
+ Src = SVI->getOperand(0);
+ else {
+ SrcIdx -= LHSWidth;
+ Src = SVI->getOperand(1);
+ }
+ Type *Int32Ty = Type::getInt32Ty(EI.getContext());
+ return ExtractElementInst::Create(Src,
+ ConstantInt::get(Int32Ty,
+ SrcIdx, false));
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ // Canonicalize extractelement(cast) -> cast(extractelement).
+ // Bitcasts can change the number of vector elements, and they cost
+ // nothing.
+ if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
+ Value *EE = Builder.CreateExtractElement(CI->getOperand(0),
+ EI.getIndexOperand());
+ Worklist.AddValue(EE);
+ return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+ }
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ if (SI->hasOneUse()) {
+ // TODO: For a select on vectors, it might be useful to do this if it
+ // has multiple extractelement uses. For vector select, that seems to
+ // fight the vectorizer.
+
+ // If we are extracting an element from a vector select or a select on
+ // vectors, create a select on the scalars extracted from the vector
+ // arguments.
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+
+ Value *Cond = SI->getCondition();
+ if (Cond->getType()->isVectorTy()) {
+ Cond = Builder.CreateExtractElement(Cond,
+ EI.getIndexOperand(),
+ Cond->getName() + ".elt");
+ }
+
+ Value *V1Elem
+ = Builder.CreateExtractElement(TrueVal,
+ EI.getIndexOperand(),
+ TrueVal->getName() + ".elt");
+
+ Value *V2Elem
+ = Builder.CreateExtractElement(FalseVal,
+ EI.getIndexOperand(),
+ FalseVal->getName() + ".elt");
+ return SelectInst::Create(Cond,
+ V1Elem,
+ V2Elem,
+ SI->getName() + ".elt");
+ }
+ }
+ }
+ return nullptr;
+}
+
+/// If V is a shuffle of values that ONLY returns elements from either LHS or
+/// RHS, return the shuffle mask and true. Otherwise, return false.
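+///
+/// For example (illustrative IR), given
+///   %e = extractelement <4 x i32> %RHS, i32 0
+///   %V = insertelement <4 x i32> %LHS, i32 %e, i32 2
+/// the collected mask is <0, 1, 4, 3>: an identity copy of LHS except that
+/// lane 2 comes from lane 0 of RHS.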
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+ SmallVectorImpl<Constant*> &Mask) {
+ assert(LHS->getType() == RHS->getType() &&
+ "Invalid CollectSingleShuffleElements");
+ unsigned NumElts = V->getType()->getVectorNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+ return true;
+ }
+
+ if (V == LHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+ return true;
+ }
+
+ if (V == RHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()),
+ i+NumElts));
+ return true;
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (!isa<ConstantInt>(IdxOp))
+ return false;
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted undef.
+ Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));
+ return true;
+ }
+ } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+ if (isa<ConstantInt>(EI->getOperand(1))) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned NumLHSElts = LHS->getType()->getVectorNumElements();
+
+ // This must be extracting from either LHS or RHS.
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted value.
+ if (EI->getOperand(0) == LHS) {
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::getInt32Ty(V->getContext()),
+ ExtractedIdx);
+ } else {
+ assert(EI->getOperand(0) == RHS);
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::getInt32Ty(V->getContext()),
+ ExtractedIdx + NumLHSElts);
+ }
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// If we have insertion into a vector that is wider than the vector that we
+/// are extracting from, try to widen the source vector to allow a single
+/// shufflevector to replace one or more insert/extract pairs.
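+///
+/// For example (illustrative IR), when a <2 x float> source feeds extracts
+/// whose values are inserted into a <4 x float>, the source is first widened
+/// with undef lanes:
+///   %wide = shufflevector <2 x float> %src, <2 x float> undef,
+///                         <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+/// and the original extracts are redirected to %wide so a later combine can
+/// form a single shuffle.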
+static void replaceExtractElements(InsertElementInst *InsElt,
+ ExtractElementInst *ExtElt,
+ InstCombiner &IC) {
+ VectorType *InsVecType = InsElt->getType();
+ VectorType *ExtVecType = ExtElt->getVectorOperandType();
+ unsigned NumInsElts = InsVecType->getVectorNumElements();
+ unsigned NumExtElts = ExtVecType->getVectorNumElements();
+
+ // The inserted-to vector must be wider than the extracted-from vector.
+ if (InsVecType->getElementType() != ExtVecType->getElementType() ||
+ NumExtElts >= NumInsElts)
+ return;
+
+  // Create a shuffle mask to widen the extracted-from vector using undefined
+ // values. The mask selects all of the values of the original vector followed
+ // by as many undefined values as needed to create a vector of the same length
+ // as the inserted-to vector.
+ SmallVector<Constant *, 16> ExtendMask;
+ IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());
+ for (unsigned i = 0; i < NumExtElts; ++i)
+ ExtendMask.push_back(ConstantInt::get(IntType, i));
+ for (unsigned i = NumExtElts; i < NumInsElts; ++i)
+ ExtendMask.push_back(UndefValue::get(IntType));
+
+ Value *ExtVecOp = ExtElt->getVectorOperand();
+ auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+ BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ ? ExtVecOpInst->getParent()
+ : ExtElt->getParent();
+
+ // TODO: This restriction matches the basic block check below when creating
+ // new extractelement instructions. If that limitation is removed, this one
+ // could also be removed. But for now, we just bail out to ensure that we
+ // will replace the extractelement instruction that is feeding our
+ // insertelement instruction. This allows the insertelement to then be
+ // replaced by a shufflevector. If the insertelement is not replaced, we can
+ // induce infinite looping because there's an optimization for extractelement
+ // that will delete our widening shuffle. This would trigger another attempt
+ // here to create that shuffle, and we spin forever.
+ if (InsertionBlock != InsElt->getParent())
+ return;
+
+ // TODO: This restriction matches the check in visitInsertElementInst() and
+ // prevents an infinite loop caused by not turning the extract/insert pair
+ // into a shuffle. We really should not need either check, but we're lacking
+ // folds for shufflevectors because we're afraid to generate shuffle masks
+ // that the backend can't handle.
+ if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
+ return;
+
+ auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
+ ConstantVector::get(ExtendMask));
+
+ // Insert the new shuffle after the vector operand of the extract is defined
+ // (as long as it's not a PHI) or at the start of the basic block of the
+ // extract, so any subsequent extracts in the same basic block can use it.
+ // TODO: Insert before the earliest ExtractElementInst that is replaced.
+ if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ WideVec->insertAfter(ExtVecOpInst);
+ else
+ IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
+
+ // Replace extracts from the original narrow vector with extracts from the new
+ // wide vector.
+ for (User *U : ExtVecOp->users()) {
+ ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
+ if (!OldExt || OldExt->getParent() != WideVec->getParent())
+ continue;
+ auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
+ NewExt->insertAfter(OldExt);
+ IC.replaceInstUsesWith(*OldExt, NewExt);
+ }
+}
+
+/// We are building a shuffle to create V, which is a sequence of insertelement,
+/// extractelement pairs. If PermittedRHS is set, then we must either use it or
+/// not rely on the second vector source. Return a std::pair containing the
+/// left and right vectors of the proposed shuffle (or null), and set the Mask
+/// parameter as required.
+///
+/// Note: we intentionally don't try to fold earlier shuffles since they have
+/// often been chosen carefully to be efficiently implementable on the target.
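+///
+/// For example (illustrative IR), the chain
+///   %i0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+///   %i1 = insertelement <4 x i32> %i0, i32 %a1, i32 1
+///   %i2 = insertelement <4 x i32> %i1, i32 %b2, i32 2
+///   %i3 = insertelement <4 x i32> %i2, i32 %b3, i32 3
+/// where %a0/%a1 extract lanes 0/1 of %a and %b2/%b3 extract lanes 2/3 of %b,
+/// collects to the pair (%a, %b) with Mask = <0, 1, 6, 7>, which the caller
+/// can emit as a single shufflevector.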
+typedef std::pair<Value *, Value *> ShuffleOps;
+
+static ShuffleOps collectShuffleElements(Value *V,
+ SmallVectorImpl<Constant *> &Mask,
+ Value *PermittedRHS,
+ InstCombiner &IC) {
+ assert(V->getType()->isVectorTy() && "Invalid shuffle!");
+ unsigned NumElts = V->getType()->getVectorNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+ return std::make_pair(
+ PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
+ }
+
+ if (isa<ConstantAggregateZero>(V)) {
+ Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0));
+ return std::make_pair(V, nullptr);
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ // Either the extracted from or inserted into vector must be RHSVec,
+ // otherwise we'd end up with a shuffle of three inputs.
+ if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
+ Value *RHS = EI->getOperand(0);
+ ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
+ assert(LR.second == nullptr || LR.second == RHS);
+
+ if (LR.first->getType() != RHS->getType()) {
+ // Although we are giving up for now, see if we can create extracts
+ // that match the inserts for another round of combining.
+ replaceExtractElements(IEI, EI, IC);
+
+ // We tried our best, but we can't find anything compatible with RHS
+ // further up the chain. Return a trivial shuffle.
+ for (unsigned i = 0; i < NumElts; ++i)
+ Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i);
+ return std::make_pair(V, nullptr);
+ }
+
+ unsigned NumLHSElts = RHS->getType()->getVectorNumElements();
+ Mask[InsertedIdx % NumElts] =
+ ConstantInt::get(Type::getInt32Ty(V->getContext()),
+ NumLHSElts+ExtractedIdx);
+ return std::make_pair(LR.first, RHS);
+ }
+
+ if (VecOp == PermittedRHS) {
+ // We've gone as far as we can: anything on the other side of the
+ // extractelement will already have been converted into a shuffle.
+ unsigned NumLHSElts =
+ EI->getOperand(0)->getType()->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(
+ Type::getInt32Ty(V->getContext()),
+ i == InsertedIdx ? ExtractedIdx : NumLHSElts + i));
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+
+ // If this insertelement is a chain that comes from exactly these two
+ // vectors, return the vector and the effective shuffle.
+ if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
+ collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
+ Mask))
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+ }
+ }
+
+ // Otherwise, we can't do anything fancy. Return an identity vector.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+ return std::make_pair(V, nullptr);
+}
+
+/// Try to find redundant insertvalue instructions, like the following ones:
+/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0
+/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0
+/// Here the second instruction inserts values at the same indices as the
+/// first one, making the first one redundant.
+/// It should be transformed to:
+/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
+Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
+ bool IsRedundant = false;
+ ArrayRef<unsigned int> FirstIndices = I.getIndices();
+
+ // If there is a chain of insertvalue instructions (each of them except the
+ // last one has only one use and it's another insertvalue insn from this
+ // chain), check if any of the 'children' uses the same indices as the first
+ // instruction. In this case, the first one is redundant.
+ Value *V = &I;
+ unsigned Depth = 0;
+ while (V->hasOneUse() && Depth < 10) {
+ User *U = V->user_back();
+ auto UserInsInst = dyn_cast<InsertValueInst>(U);
+ if (!UserInsInst || U->getOperand(0) != V)
+ break;
+ if (UserInsInst->getIndices() == FirstIndices) {
+ IsRedundant = true;
+ break;
+ }
+ V = UserInsInst;
+ Depth++;
+ }
+
+ if (IsRedundant)
+ return replaceInstUsesWith(I, I.getOperand(0));
+ return nullptr;
+}
+
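+// Return true if every lane of the shuffle either keeps its position in one
+// of the two (equal-sized) source operands or is undef, i.e. the shuffle acts
+// as a per-lane select. For example, on <4 x i32> operands the mask
+// <0, 5, 2, 7> is select-equivalent, while <1, 5, 2, 7> is not because lane 0
+// would read lane 1 of the first operand.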
+static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
+ int MaskSize = Shuf.getMask()->getType()->getVectorNumElements();
+ int VecSize = Shuf.getOperand(0)->getType()->getVectorNumElements();
+
+ // A vector select does not change the size of the operands.
+ if (MaskSize != VecSize)
+ return false;
+
+ // Each mask element must be undefined or choose a vector element from one of
+ // the source operands without crossing vector lanes.
+ for (int i = 0; i != MaskSize; ++i) {
+ int Elt = Shuf.getMaskValue(i);
+ if (Elt != -1 && Elt != i && Elt != i + VecSize)
+ return false;
+ }
+
+ return true;
+}
+
+// Turn a chain of inserts that splats a value into a canonical insert + shuffle
+// splat. That is:
+// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
+// shufflevector(insertelt(X, %k, 0), undef, zero)
+static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
+ // We are interested in the last insert in a chain. So, if this insert
+ // has a single user, and that user is an insert, bail.
+ if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
+ return nullptr;
+
+ VectorType *VT = cast<VectorType>(InsElt.getType());
+ int NumElements = VT->getNumElements();
+
+  // Do not try to do this for a one-element vector, since that's a no-op
+  // and would cause an infinite loop.
+ if (NumElements == 1)
+ return nullptr;
+
+ Value *SplatVal = InsElt.getOperand(1);
+ InsertElementInst *CurrIE = &InsElt;
+ SmallVector<bool, 16> ElementPresent(NumElements, false);
+
+ // Walk the chain backwards, keeping track of which indices we inserted into,
+ // until we hit something that isn't an insert of the splatted value.
+ while (CurrIE) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
+ if (!Idx || CurrIE->getOperand(1) != SplatVal)
+ return nullptr;
+
+ // Check none of the intermediate steps have any additional uses.
+ if ((CurrIE != &InsElt) && !CurrIE->hasOneUse())
+ return nullptr;
+
+ ElementPresent[Idx->getZExtValue()] = true;
+ CurrIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
+ }
+
+ // Make sure we've seen an insert into every element.
+ if (llvm::any_of(ElementPresent, [](bool Present) { return !Present; }))
+ return nullptr;
+
+ // All right, create the insert + shuffle.
+ Instruction *InsertFirst = InsertElementInst::Create(
+ UndefValue::get(VT), SplatVal,
+ ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), 0), "", &InsElt);
+
+ Constant *ZeroMask = ConstantAggregateZero::get(
+ VectorType::get(Type::getInt32Ty(InsElt.getContext()), NumElements));
+
+ return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
+}
+
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+///
+/// This has the potential of eliminating the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+ InstCombiner::BuilderTy &Builder) {
+ auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+ if (!InsElt1 || !InsElt1->hasOneUse())
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *ScalarC;
+ ConstantInt *IdxC1, *IdxC2;
+ if (match(InsElt1->getOperand(0), m_Value(X)) &&
+ match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+ match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+ match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+ match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
+ Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+ return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+ }
+
+ return nullptr;
+}
+
+/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
+/// --> shufflevector X, CVec', Mask'
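+///
+/// For example (illustrative IR):
+///   %s = shufflevector <4 x i32> %x, <4 x i32> <i32 9, i32 9, i32 9, i32 9>,
+///                      <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+///   %r = insertelement <4 x i32> %s, i32 42, i32 2
+/// becomes
+///   %r = shufflevector <4 x i32> %x, <4 x i32> <i32 9, i32 9, i32 42, i32 9>,
+///                      <4 x i32> <i32 0, i32 5, i32 6, i32 7>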
+static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
+ auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
+ // Bail out if the parent has more than one use. In that case, we'd be
+ // replacing the insertelt with a shuffle, and that's not a clear win.
+ if (!Inst || !Inst->hasOneUse())
+ return nullptr;
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
+ // The shuffle must have a constant vector operand. The insertelt must have
+ // a constant scalar being inserted at a constant position in the vector.
+ Constant *ShufConstVec, *InsEltScalar;
+ uint64_t InsEltIndex;
+ if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
+ !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
+ !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
+ return nullptr;
+
+ // Adding an element to an arbitrary shuffle could be expensive, but a
+ // shuffle that selects elements from vectors without crossing lanes is
+ // assumed cheap.
+ // If we're just adding a constant into that shuffle, it will still be
+ // cheap.
+ if (!isShuffleEquivalentToSelect(*Shuf))
+ return nullptr;
+
+ // From the above 'select' check, we know that the mask has the same number
+ // of elements as the vector input operands. We also know that each constant
+    // input element is used in its lane and cannot be used more than once by
+ // the shuffle. Therefore, replace the constant in the shuffle's constant
+ // vector with the insertelt constant. Replace the constant in the shuffle's
+ // mask vector with the insertelt index plus the length of the vector
+ // (because the constant vector operand of a shuffle is always the 2nd
+ // operand).
+ Constant *Mask = Shuf->getMask();
+ unsigned NumElts = Mask->getType()->getVectorNumElements();
+ SmallVector<Constant *, 16> NewShufElts(NumElts);
+ SmallVector<Constant *, 16> NewMaskElts(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (I == InsEltIndex) {
+ NewShufElts[I] = InsEltScalar;
+ Type *Int32Ty = Type::getInt32Ty(Shuf->getContext());
+ NewMaskElts[I] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts);
+ } else {
+ // Copy over the existing values.
+ NewShufElts[I] = ShufConstVec->getAggregateElement(I);
+ NewMaskElts[I] = Mask->getAggregateElement(I);
+ }
+ }
+
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt. The old shuffle will be dead now.
+ return new ShuffleVectorInst(Shuf->getOperand(0),
+ ConstantVector::get(NewShufElts),
+ ConstantVector::get(NewMaskElts));
+ } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
+    // Transform sequences of insertelement ops with constant data/indexes into
+ // a single shuffle op.
+ unsigned NumElts = InsElt.getType()->getNumElements();
+
+ uint64_t InsertIdx[2];
+ Constant *Val[2];
+ if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
+ !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
+ !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
+ !match(IEI->getOperand(1), m_Constant(Val[1])))
+ return nullptr;
+ SmallVector<Constant *, 16> Values(NumElts);
+ SmallVector<Constant *, 16> Mask(NumElts);
+ auto ValI = std::begin(Val);
+ // Generate new constant vector and mask.
+ // We have 2 values/masks from the insertelements instructions. Insert them
+ // into new value/mask vectors.
+ for (uint64_t I : InsertIdx) {
+ if (!Values[I]) {
+ assert(!Mask[I]);
+ Values[I] = *ValI;
+ Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()),
+ NumElts + I);
+ }
+ ++ValI;
+ }
+ // Remaining values are filled with 'undef' values.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (!Values[I]) {
+ assert(!Mask[I]);
+ Values[I] = UndefValue::get(InsElt.getType()->getElementType());
+ Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), I);
+ }
+ }
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt.
+ return new ShuffleVectorInst(IEI->getOperand(0),
+ ConstantVector::get(Values),
+ ConstantVector::get(Mask));
+ }
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+ Value *VecOp = IE.getOperand(0);
+ Value *ScalarOp = IE.getOperand(1);
+ Value *IdxOp = IE.getOperand(2);
+
+  // If we are inserting an undef value or inserting into an undefined place,
+  // remove this instruction.
+ if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
+ replaceInstUsesWith(IE, VecOp);
+
+ // If the inserted element was extracted from some other vector, and if the
+ // indexes are constant, try to turn this into a shufflevector operation.
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
+ unsigned NumInsertVectorElts = IE.getType()->getNumElements();
+ unsigned NumExtractVectorElts =
+ EI->getOperand(0)->getType()->getVectorNumElements();
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
+ return replaceInstUsesWith(IE, VecOp);
+
+ if (InsertedIdx >= NumInsertVectorElts) // Out of range insert.
+ return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+ // If we are extracting a value from a vector, then inserting it right
+ // back into the same place, just use the input vector.
+ if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+ return replaceInstUsesWith(IE, VecOp);
+
+ // If this insertelement isn't used by some other insertelement, turn it
+ // (and any insertelements it points to), into one big shuffle.
+ if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
+ SmallVector<Constant*, 16> Mask;
+ ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
+
+ // The proposed shuffle may be trivial, in which case we shouldn't
+ // perform the combine.
+ if (LR.first != &IE && LR.second != &IE) {
+ // We now have a shuffle of LHS, RHS, Mask.
+ if (LR.second == nullptr)
+ LR.second = UndefValue::get(LR.first->getType());
+ return new ShuffleVectorInst(LR.first, LR.second,
+ ConstantVector::get(Mask));
+ }
+ }
+ }
+ }
+
+ unsigned VWidth = VecOp->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
+ if (V != &IE)
+ return replaceInstUsesWith(IE, V);
+ return &IE;
+ }
+
+ if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
+ return Shuf;
+
+ if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
+ return NewInsElt;
+
+ // Turn a sequence of inserts that broadcasts a scalar into a single
+ // insert + shufflevector.
+ if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
+ return Broadcast;
+
+ return nullptr;
+}
+
+/// Return true if we can evaluate the specified expression tree if the vector
+/// elements were shuffled in a different order.
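+///
+/// For example (illustrative IR), with the reversing mask <3, 2, 1, 0> and
+/// single-use operands
+///   %v = insertelement <4 x i32> undef, i32 %s, i32 0
+///   %a = add <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>
+/// the tree rooted at %a can be evaluated in the shuffled order: the insert
+/// moves to lane 3 and the constant operand is reversed (see
+/// EvaluateInDifferentElementOrder).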
+static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
+ unsigned Depth = 5) {
+ // We can always reorder the elements of a constant.
+ if (isa<Constant>(V))
+ return true;
+
+ // We won't reorder vector arguments. No IPO here.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Two users may expect different orders of the elements. Don't try it.
+ if (!I->hasOneUse())
+ return false;
+
+ if (Depth == 0) return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::GetElementPtr: {
+ for (Value *Operand : I->operands()) {
+ if (!CanEvaluateShuffled(Operand, Mask, Depth-1))
+ return false;
+ }
+ return true;
+ }
+ case Instruction::InsertElement: {
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!CI) return false;
+ int ElementNumber = CI->getLimitedValue();
+
+    // Verify that the inserted element number does not occur twice in Mask.
+    // A single 'insertelement' can't put an element into multiple indices.
+ bool SeenOnce = false;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == ElementNumber) {
+ if (SeenOnce)
+ return false;
+ SeenOnce = true;
+ }
+ }
+ return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1);
+ }
+ }
+ return false;
+}
+
+/// Rebuild a new instruction just like 'I' but with the new operands given.
+/// In the event of a type mismatch, the types of the new operands are used.
+static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+ // We don't want to use the IRBuilder here because we want the replacement
+ // instructions to appear next to 'I', not the builder's insertion point.
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+ assert(NewOps.size() == 2 && "binary operator with #ops != 2");
+ BinaryOperator *New =
+ BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
+ NewOps[0], NewOps[1], "", BO);
+ if (isa<OverflowingBinaryOperator>(BO)) {
+ New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
+ New->setHasNoSignedWrap(BO->hasNoSignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(BO)) {
+ New->setIsExact(BO->isExact());
+ }
+ if (isa<FPMathOperator>(BO))
+ New->copyFastMathFlags(I);
+ return New;
+ }
+ case Instruction::ICmp:
+ assert(NewOps.size() == 2 && "icmp with #ops != 2");
+ return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::FCmp:
+ assert(NewOps.size() == 2 && "fcmp with #ops != 2");
+ return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt: {
+ // It's possible that the mask has a different number of elements from
+ // the original cast. We recompute the destination type to match the mask.
+ Type *DestTy =
+ VectorType::get(I->getType()->getScalarType(),
+ NewOps[0]->getType()->getVectorNumElements());
+ assert(NewOps.size() == 1 && "cast with #ops != 1");
+ return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
+ "", I);
+ }
+ case Instruction::GetElementPtr: {
+ Value *Ptr = NewOps[0];
+ ArrayRef<Value*> Idx = NewOps.slice(1);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
+ GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
+ return GEP;
+ }
+ }
+ llvm_unreachable("failed to rebuild vector instructions");
+}
+
+Value *
+InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
+ // Mask.size() does not need to be equal to the number of vector elements.
+
+ assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
+ if (isa<UndefValue>(V)) {
+ return UndefValue::get(VectorType::get(V->getType()->getScalarType(),
+ Mask.size()));
+ }
+ if (isa<ConstantAggregateZero>(V)) {
+ return ConstantAggregateZero::get(
+ VectorType::get(V->getType()->getScalarType(),
+ Mask.size()));
+ }
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ SmallVector<Constant *, 16> MaskValues;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == -1)
+ MaskValues.push_back(UndefValue::get(Builder.getInt32Ty()));
+ else
+ MaskValues.push_back(Builder.getInt32(Mask[i]));
+ }
+ return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
+ ConstantVector::get(MaskValues));
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::Select:
+ case Instruction::GetElementPtr: {
+ SmallVector<Value*, 8> NewOps;
+ bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements());
+ for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask);
+ NewOps.push_back(V);
+ NeedsRebuild |= (V != I->getOperand(i));
+ }
+ if (NeedsRebuild) {
+ return buildNew(I, NewOps);
+ }
+ return I;
+ }
+ case Instruction::InsertElement: {
+ int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
+
+ // The insertelement was inserting at Element. Figure out which element
+ // that becomes after shuffling. The answer is guaranteed to be unique
+ // by CanEvaluateShuffled.
+ bool Found = false;
+ int Index = 0;
+ for (int e = Mask.size(); Index != e; ++Index) {
+ if (Mask[Index] == Element) {
+ Found = true;
+ break;
+ }
+ }
+
+    // If the element is not in Mask, there is no need to handle operand 1
+    // (the element to be inserted). Just evaluate the values in operand 0
+    // according to Mask.
+ if (!Found)
+ return EvaluateInDifferentElementOrder(I->getOperand(0), Mask);
+
+ Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask);
+ return InsertElementInst::Create(V, I->getOperand(1),
+ Builder.getInt32(Index), "", I);
+ }
+ }
+ llvm_unreachable("failed to reorder elements of vector instruction!");
+}
+
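+// Set isLHSID/isRHSID to indicate whether Mask is an identity mapping of the
+// first or second shuffle operand, ignoring undef lanes. For example, with
+// 4-element operands, <0, 1, undef, 3> is an LHS identity and <4, 5, 6, 7> is
+// an RHS identity.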
+static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask,
+ bool &isLHSID, bool &isRHSID) {
+ isLHSID = isRHSID = true;
+
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] < 0) continue; // Ignore undef values.
+ // Is this an identity shuffle of the LHS value?
+ isLHSID &= (Mask[i] == (int)i);
+
+ // Is this an identity shuffle of the RHS value?
+ isRHSID &= (Mask[i]-e == i);
+ }
+}
+
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+// Shuffles to: |EE|FF|GG|HH|
+// +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+ SmallVector<int, 16> &Mask) {
+ unsigned LHSElems = SVI.getOperand(0)->getType()->getVectorNumElements();
+ unsigned MaskElems = Mask.size();
+ unsigned BegIdx = Mask.front();
+ unsigned EndIdx = Mask.back();
+ if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+ return false;
+ for (unsigned I = 0; I != MaskElems; ++I)
+ if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+ return false;
+ return true;
+}
+
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ Value *LHS = SVI.getOperand(0);
+ Value *RHS = SVI.getOperand(1);
+ SmallVector<int, 16> Mask = SVI.getShuffleMask();
+ Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+
+ if (auto *V = SimplifyShuffleVectorInst(
+ LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
+ return replaceInstUsesWith(SVI, V);
+
+ bool MadeChange = false;
+ unsigned VWidth = SVI.getType()->getVectorNumElements();
+
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+ if (V != &SVI)
+ return replaceInstUsesWith(SVI, V);
+ LHS = SVI.getOperand(0);
+ RHS = SVI.getOperand(1);
+ MadeChange = true;
+ }
+
+ unsigned LHSWidth = LHS->getType()->getVectorNumElements();
+
+  // Canonicalize shuffle(x, x, mask) -> shuffle(x, undef, mask').
+  // Canonicalize shuffle(undef, x, mask) -> shuffle(x, undef, mask').
+ if (LHS == RHS || isa<UndefValue>(LHS)) {
+ if (isa<UndefValue>(LHS) && LHS == RHS) {
+ // shuffle(undef,undef,mask) -> undef.
+ Value *Result = (VWidth == LHSWidth)
+ ? LHS : UndefValue::get(SVI.getType());
+ return replaceInstUsesWith(SVI, Result);
+ }
+
+ // Remap any references to RHS to use LHS.
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
+ if (Mask[i] < 0) {
+ Elts.push_back(UndefValue::get(Int32Ty));
+ continue;
+ }
+
+ if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
+ (Mask[i] < (int)e && isa<UndefValue>(LHS))) {
+ Mask[i] = -1; // Turn into undef.
+ Elts.push_back(UndefValue::get(Int32Ty));
+ } else {
+ Mask[i] = Mask[i] % e; // Force to LHS.
+ Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
+ }
+ }
+ SVI.setOperand(0, SVI.getOperand(1));
+ SVI.setOperand(1, UndefValue::get(RHS->getType()));
+ SVI.setOperand(2, ConstantVector::get(Elts));
+ LHS = SVI.getOperand(0);
+ RHS = SVI.getOperand(1);
+ MadeChange = true;
+ }
+
+ if (VWidth == LHSWidth) {
+    // Analyze the shuffle: is the LHS or RHS an identity shuffle?
+ bool isLHSID, isRHSID;
+ recognizeIdentityMask(Mask, isLHSID, isRHSID);
+
+ // Eliminate identity shuffles.
+ if (isLHSID) return replaceInstUsesWith(SVI, LHS);
+ if (isRHSID) return replaceInstUsesWith(SVI, RHS);
+ }
+
+ if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) {
+ Value *V = EvaluateInDifferentElementOrder(LHS, Mask);
+ return replaceInstUsesWith(SVI, V);
+ }
+
+ // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+ // a non-vector type. We can instead bitcast the original vector followed by
+ // an extract of the desired element:
+ //
+ // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+ // <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // %1 = bitcast <4 x i8> %sroa to i32
+ // Becomes:
+ // %bc = bitcast <16 x i8> %in to <4 x i32>
+ // %ext = extractelement <4 x i32> %bc, i32 0
+ //
+ // If the shuffle is extracting a contiguous range of values from the input
+ // vector then each use which is a bitcast of the extracted size can be
+ // replaced. This will work if the vector types are compatible, and the begin
+ // index is aligned to a value in the casted vector type. If the begin index
+ // isn't aligned then we can shuffle the original vector (keeping the same
+ // vector type) before extracting.
+ //
+ // This code will bail out if the target type is fundamentally incompatible
+ // with vectors of the source type.
+ //
+ // Example of <16 x i8>, target type i32:
+ // Index range [4,8): v-----------v Will work.
+ // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ // <16 x i8>: | | | | | | | | | | | | | | | | |
+ // <4 x i32>: | | | | |
+ // +-----------+-----------+-----------+-----------+
+ // Index range [6,10): ^-----------^ Needs an extra shuffle.
+ // Target type i40: ^--------------^ Won't work, bail.
+ if (isShuffleExtractingFromLHS(SVI, Mask)) {
+ Value *V = LHS;
+ unsigned MaskElems = Mask.size();
+ VectorType *SrcTy = cast<VectorType>(V->getType());
+ unsigned VecBitWidth = SrcTy->getBitWidth();
+ unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
+ assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+ unsigned SrcNumElems = SrcTy->getNumElements();
+ SmallVector<BitCastInst *, 8> BCs;
+ DenseMap<Type *, Value *> NewBCs;
+ for (User *U : SVI.users())
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+ if (!BC->use_empty())
+ // Only visit bitcasts that weren't previously handled.
+ BCs.push_back(BC);
+ for (BitCastInst *BC : BCs) {
+ unsigned BegIdx = Mask.front();
+ Type *TgtTy = BC->getDestTy();
+ unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
+ if (!TgtElemBitWidth)
+ continue;
+ unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+ bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+ bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+ if (!VecBitWidthsEqual)
+ continue;
+ if (!VectorType::isValidElementType(TgtTy))
+ continue;
+ VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+ if (!BegIsAligned) {
+        // Shuffle the input so [0,MaskElems) contains the output, and
+        // [MaskElems,SrcNumElems) is undef.
+ SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
+ UndefValue::get(Int32Ty));
+ for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+ ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(ShuffleMask),
+ SVI.getName() + ".extract");
+ BegIdx = 0;
+ }
+ unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+ assert(SrcElemsPerTgtElem);
+ BegIdx /= SrcElemsPerTgtElem;
+ bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+ auto *NewBC =
+ BCAlreadyExists
+ ? NewBCs[CastSrcTy]
+ : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+ if (!BCAlreadyExists)
+ NewBCs[CastSrcTy] = NewBC;
+ auto *Ext = Builder.CreateExtractElement(
+ NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+ // The shufflevector isn't being replaced: the bitcast that used it
+ // is. InstCombine will visit the newly-created instructions.
+ replaceInstUsesWith(*BC, Ext);
+ MadeChange = true;
+ }
+ }
+
+ // If the LHS is a shufflevector itself, see if we can combine it with this
+ // one without producing an unusual shuffle.
+ // Cases that might be simplified:
+ // 1.
+ // x1=shuffle(v1,v2,mask1)
+ // x=shuffle(x1,undef,mask)
+ // ==>
+ // x=shuffle(v1,undef,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
+ // 2.
+ // x1=shuffle(v1,undef,mask1)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == mask1.size()
+ // ==>
+ // x=shuffle(v1,x2,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
+ // 3.
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v2.size() == mask2.size()
+ // ==>
+ // x=shuffle(x1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
+ // 4.
+ // x1=shuffle(v1,undef,mask1)
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == v2.size()
+ // ==>
+ // x=shuffle(v1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
+ //
+ // Here we are really conservative:
+ // we are absolutely afraid of producing a shuffle mask not in the input
+ // program, because the code gen may not be smart enough to turn a merged
+ // shuffle into two specific shuffles: it may produce worse code. As such,
+ // we only merge two shuffles if the result is either a splat or one of the
+ // input shuffle masks. In this case, merging the shuffles just removes
+ // one instruction, which we know is safe. This is good for things like
+ // turning: (splat(splat)) -> splat, or
+ // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
+ ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LHSShuffle)
+ if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
+ LHSShuffle = nullptr;
+ if (RHSShuffle)
+ if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
+ RHSShuffle = nullptr;
+ if (!LHSShuffle && !RHSShuffle)
+ return MadeChange ? &SVI : nullptr;
+
+ Value* LHSOp0 = nullptr;
+ Value* LHSOp1 = nullptr;
+ Value* RHSOp0 = nullptr;
+ unsigned LHSOp0Width = 0;
+ unsigned RHSOp0Width = 0;
+ if (LHSShuffle) {
+ LHSOp0 = LHSShuffle->getOperand(0);
+ LHSOp1 = LHSShuffle->getOperand(1);
+ LHSOp0Width = LHSOp0->getType()->getVectorNumElements();
+ }
+ if (RHSShuffle) {
+ RHSOp0 = RHSShuffle->getOperand(0);
+ RHSOp0Width = RHSOp0->getType()->getVectorNumElements();
+ }
+ Value* newLHS = LHS;
+ Value* newRHS = RHS;
+ if (LHSShuffle) {
+ // case 1
+ if (isa<UndefValue>(RHS)) {
+ newLHS = LHSOp0;
+ newRHS = LHSOp1;
+ }
+ // case 2 or 4
+ else if (LHSOp0Width == LHSWidth) {
+ newLHS = LHSOp0;
+ }
+ }
+ // case 3 or 4
+ if (RHSShuffle && RHSOp0Width == LHSWidth) {
+ newRHS = RHSOp0;
+ }
+ // case 4
+ if (LHSOp0 == RHSOp0) {
+ newLHS = LHSOp0;
+ newRHS = nullptr;
+ }
+
+ if (newLHS == LHS && newRHS == RHS)
+ return MadeChange ? &SVI : nullptr;
+
+ SmallVector<int, 16> LHSMask;
+ SmallVector<int, 16> RHSMask;
+ if (newLHS != LHS)
+ LHSMask = LHSShuffle->getShuffleMask();
+ if (RHSShuffle && newRHS != RHS)
+ RHSMask = RHSShuffle->getShuffleMask();
+
+ unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
+ SmallVector<int, 16> newMask;
+ bool isSplat = true;
+ int SplatElt = -1;
+ // Create a new mask for the new ShuffleVectorInst so that the new
+ // ShuffleVectorInst is equivalent to the original one.
+ for (unsigned i = 0; i < VWidth; ++i) {
+ int eltMask;
+ if (Mask[i] < 0) {
+ // This element is an undef value.
+ eltMask = -1;
+ } else if (Mask[i] < (int)LHSWidth) {
+ // This element is from left hand side vector operand.
+ //
+ // If LHS is going to be replaced (case 1, 2, or 4), calculate the
+ // new mask value for the element.
+ if (newLHS != LHS) {
+ eltMask = LHSMask[Mask[i]];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
+ eltMask = -1;
+ } else
+ eltMask = Mask[i];
+ } else {
+ // This element is from right hand side vector operand
+ //
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value. (case 1)
+ if (isa<UndefValue>(RHS))
+ eltMask = -1;
+ // If RHS is going to be replaced (case 3 or 4), calculate the
+ // new mask value for the element.
+ else if (newRHS != RHS) {
+ eltMask = RHSMask[Mask[i]-LHSWidth];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)RHSOp0Width) {
+ assert(isa<UndefValue>(RHSShuffle->getOperand(1))
+ && "should have been check above");
+ eltMask = -1;
+ }
+ } else
+ eltMask = Mask[i]-LHSWidth;
+
+ // If LHS's width is changed, shift the mask value accordingly.
+ // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any
+ // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
+ // If newRHS == newLHS, we want to remap any references from newRHS to
+ // newLHS so that we can properly identify splats that may occur due to
+ // obfuscation across the two vectors.
+ if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
+ eltMask += newLHSWidth;
+ }
+
+ // Check if this could still be a splat.
+ if (eltMask >= 0) {
+ if (SplatElt >= 0 && SplatElt != eltMask)
+ isSplat = false;
+ SplatElt = eltMask;
+ }
+
+ newMask.push_back(eltMask);
+ }
+
+ // If the result mask is equal to one of the original shuffle masks,
+ // or is a splat, do the replacement.
+ if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
+ if (newMask[i] < 0) {
+ Elts.push_back(UndefValue::get(Int32Ty));
+ } else {
+ Elts.push_back(ConstantInt::get(Int32Ty, newMask[i]));
+ }
+ }
+ if (!newRHS)
+ newRHS = UndefValue::get(newLHS->getType());
+ return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts));
+ }
+
+ // If the result mask is an identity, replace uses of this instruction with
+ // corresponding argument.
+ bool isLHSID, isRHSID;
+ recognizeIdentityMask(newMask, isLHSID, isRHSID);
+ if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS);
+ if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS);
+
+ return MadeChange ? &SVI : nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
new file mode 100644
index 000000000000..90e232399155
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -0,0 +1,3248 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions. This pass does not modify the CFG. This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+// %Y = add i32 %X, 1
+// %Z = add i32 %Y, 1
+// into:
+// %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+// 1. If a binary operator has a constant operand, it is moved to the RHS
+// 2. Bitwise operators with constant operands are always grouped so that
+// shifts are performed first, then or's, then and's, then xor's.
+// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+// 4. All cmp instructions on boolean values are replaced with logical ops
+// 5. add X, X is represented as (X*2) => (X << 1)
+// 6. Multiplies with a power-of-two constant argument are transformed into
+// shifts.
+// ... etc.
+//
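+// For example, canonicalization 1 rewrites "add i32 1, %X" as "add i32 %X, 1",
+// and canonicalization 6 turns "mul i32 %X, 8" into "shl i32 %X, 3".
+//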
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <climits>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+STATISTIC(NumExpand, "Number of expansions");
+STATISTIC(NumFactor , "Number of factorizations");
+STATISTIC(NumReassoc , "Number of reassociations");
+
+static cl::opt<bool>
+EnableExpensiveCombines("expensive-combines",
+ cl::desc("Enable expensive instruction combines"));
+
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+ cl::desc("Maximum array size considered when doing a combine"));
+
+Value *InstCombiner::EmitGEPOffset(User *GEP) {
+ return llvm::EmitGEPOffset(&Builder, DL, GEP);
+}
+
+/// Return true if it is desirable to convert an integer computation from a
+/// given bit width to a new bit width.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types.
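+/// For example, on a target whose DataLayout declares "n32:64" (only i32 and
+/// i64 are legal integer widths), converting i64 to i128 is rejected (legal to
+/// illegal), while converting i128 to i64 is allowed.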
+bool InstCombiner::shouldChangeType(unsigned FromWidth,
+ unsigned ToWidth) const {
+ bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+ bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
+
+ // If this is a legal integer from type, and the result would be an illegal
+ // type, don't do the transformation.
+ if (FromLegal && !ToLegal)
+ return false;
+
+ // Otherwise, if both are illegal, do not increase the size of the result. We
+ // do allow things like i160 -> i64, but not i64 -> i160.
+ if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+ return false;
+
+ return true;
+}
+
+/// Return true if it is desirable to convert a computation from 'From' to 'To'.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
+bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
+ assert(From->isIntegerTy() && To->isIntegerTy());
+
+ unsigned FromWidth = From->getPrimitiveSizeInBits();
+ unsigned ToWidth = To->getPrimitiveSizeInBits();
+ return shouldChangeType(FromWidth, ToWidth);
+}
+
+// Return true if No Signed Wrap should be maintained for I.
+// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
+// where both B and C should be ConstantInts, results in a constant that does
+// not overflow. This function only handles the Add and Sub opcodes. For
+// all other opcodes, the function conservatively returns false.
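+// For example, when reassociating two nsw i8 adds of the constants 100 and
+// 100, the folded constant 200 overflows i8 and nsw must be dropped, whereas
+// constants 3 and 4 fold to 7 and nsw can be kept.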
+static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
+ OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ if (!OBO || !OBO->hasNoSignedWrap())
+ return false;
+
+  // We reason about Add and Sub only.
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
+ return false;
+
+ const APInt *BVal, *CVal;
+ if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
+ return false;
+
+ bool Overflow = false;
+ if (Opcode == Instruction::Add)
+ (void)BVal->sadd_ov(*CVal, Overflow);
+ else
+ (void)BVal->ssub_ov(*CVal, Overflow);
+
+ return !Overflow;
+}
+
+/// Conservatively clears subclassOptionalData after a reassociation or
+/// commutation. We preserve fast-math flags when applicable because they
+/// remain valid across the reassociation.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+}
+
+/// Combine constant operands of associative operations either before or after a
+/// cast to eliminate one of the associative operations:
+/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
+/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
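+///
+/// For example (given the zext-only and bitwise-op restrictions enforced
+/// below):
+///   and (zext (and i8 %x, 12) to i32), 7  -->  and (zext i8 %x to i32), 4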
+static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) {
+ auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
+ if (!Cast || !Cast->hasOneUse())
+ return false;
+
+ // TODO: Enhance logic for other casts and remove this check.
+ auto CastOpcode = Cast->getOpcode();
+ if (CastOpcode != Instruction::ZExt)
+ return false;
+
+ // TODO: Enhance logic for other BinOps and remove this check.
+ if (!BinOp1->isBitwiseLogicOp())
+ return false;
+
+ auto AssocOpcode = BinOp1->getOpcode();
+ auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
+ if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
+ return false;
+
+ Constant *C1, *C2;
+ if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
+ !match(BinOp2->getOperand(1), m_Constant(C2)))
+ return false;
+
+ // TODO: This assumes a zext cast.
+ // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
+ // to the destination type might lose bits.
+
+ // Fold the constants together in the destination type:
+ // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
+ Type *DestTy = C1->getType();
+ Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
+ Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
+ Cast->setOperand(0, BinOp2->getOperand(0));
+ BinOp1->setOperand(1, FoldedC);
+ return true;
+}
+
+/// This performs a few simplifications for operators that are associative or
+/// commutative:
+///
+/// Commutative operators:
+///
+/// 1. Order operands such that they are listed from right (least complex) to
+/// left (most complex). This puts constants before unary operators before
+/// binary operators.
+///
+/// Associative operators:
+///
+/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+///
+/// Associative and commutative operators:
+///
+/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+/// if C1 and C2 are constants.
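+///
+/// For instance, transform 6 rewrites "(%x + 3) + (%y + 5)" as
+/// "(%x + %y) + 8" when both inner adds have no other uses.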
+bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ bool Changed = false;
+
+ do {
+ // Order operands such that they are listed from right (least complex) to
+ // left (most complex). This puts constants before unary operators before
+ // binary operators.
+ if (I.isCommutative() && getComplexity(I.getOperand(0)) <
+ getComplexity(I.getOperand(1)))
+ Changed = !I.swapOperands();
+
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
+
+ if (I.isAssociative()) {
+ // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "B op C" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "A op V".
+ I.setOperand(0, A);
+ I.setOperand(1, V);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ if (MaintainNoSignedWrap(I, B, C) &&
+ (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) {
+ // Note: this is only valid because SimplifyBinOp doesn't look at
+ // the operands to Op0.
+ I.clearSubclassOptionalData();
+ I.setHasNoSignedWrap(true);
+ } else {
+ ClearSubclassDataAfterReassociation(I);
+ }
+
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "A op B" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op C".
+ I.setOperand(0, V);
+ I.setOperand(1, C);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+ }
+
+ if (I.isAssociative() && I.isCommutative()) {
+ if (simplifyAssocCastAssoc(&I)) {
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+
+ // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op B".
+ I.setOperand(0, V);
+ I.setOperand(1, B);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "B op V".
+ I.setOperand(0, B);
+ I.setOperand(1, V);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+ // if C1 and C2 are constants.
+ if (Op0 && Op1 &&
+ Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
+ isa<Constant>(Op0->getOperand(1)) &&
+ isa<Constant>(Op1->getOperand(1)) &&
+ Op0->hasOneUse() && Op1->hasOneUse()) {
+ Value *A = Op0->getOperand(0);
+ Constant *C1 = cast<Constant>(Op0->getOperand(1));
+ Value *B = Op1->getOperand(0);
+ Constant *C2 = cast<Constant>(Op1->getOperand(1));
+
+ Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
+ BinaryOperator *New = BinaryOperator::Create(Opcode, A, B);
+ if (isa<FPMathOperator>(New)) {
+ FastMathFlags Flags = I.getFastMathFlags();
+ Flags &= Op0->getFastMathFlags();
+ Flags &= Op1->getFastMathFlags();
+ New->setFastMathFlags(Flags);
+ }
+ InsertNewInstWith(New, I);
+ New->takeName(Op1);
+ I.setOperand(0, New);
+ I.setOperand(1, Folded);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+
+ Changed = true;
+ continue;
+ }
+ }
+
+ // No further simplifications.
+ return Changed;
+ } while (1);
+}
+
+/// Return whether "X LOp (Y ROp Z)" is always equal to
+/// "(X LOp Y) ROp (X LOp Z)".
+static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ switch (LOp) {
+ default:
+ return false;
+
+ case Instruction::And:
+ // And distributes over Or and Xor.
+ switch (ROp) {
+ default:
+ return false;
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ }
+
+ case Instruction::Mul:
+ // Multiplication distributes over addition and subtraction.
+ switch (ROp) {
+ default:
+ return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ return true;
+ }
+
+ case Instruction::Or:
+ // Or distributes over And.
+ switch (ROp) {
+ default:
+ return false;
+ case Instruction::And:
+ return true;
+ }
+ }
+}
+
+/// Return whether "(X LOp Y) ROp Z" is always equal to
+/// "(X ROp Z) LOp (Y ROp Z)".
+static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ if (Instruction::isCommutative(ROp))
+ return LeftDistributesOverRight(ROp, LOp);
+
+ switch (LOp) {
+ default:
+ return false;
+ // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
+ // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
+ // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts.
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ switch (ROp) {
+ default:
+ return false;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return true;
+ }
+ }
+ // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
+ // but this requires knowing that the addition does not overflow and other
+ // such subtleties.
+ return false;
+}
+
+/// This function returns the identity value for the given opcode, which can
+/// be used to factor patterns like
+/// (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
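+/// For Add the identity is 0 and for Mul it is 1; null is returned when V is
+/// itself a constant or when the opcode has no identity value.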
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
+ if (isa<Constant>(V))
+ return nullptr;
+
+ return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
+}
+
+/// This function factors binary ops which can be combined using distributive
+/// laws. It tries to transform 'Op' based on TopLevelOpcode to enable
+/// factorization. For example, for ADD(SHL(X, 2), MUL(X, 5)), when called with
+/// TopLevelOpcode == Instruction::Add and Op = SHL(X, 2), it transforms
+/// SHL(X, 2) into MUL(X, 4), i.e. it returns Instruction::Mul with LHS set to
+/// 'X' and RHS to 4.
+static Instruction::BinaryOps
+getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
+ BinaryOperator *Op, Value *&LHS, Value *&RHS) {
+ assert(Op && "Expected a binary operator");
+
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+
+ switch (TopLevelOpcode) {
+ default:
+ return Op->getOpcode();
+
+ case Instruction::Add:
+ case Instruction::Sub:
+ if (Op->getOpcode() == Instruction::Shl) {
+ if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
+ // The multiplier is really 1 << CST.
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
+ return Instruction::Mul;
+ }
+ }
+ return Op->getOpcode();
+ }
+
+ // TODO: We can add other conversions e.g. shr => div etc.
+}
+
+/// This tries to simplify binary operations by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+Value *InstCombiner::tryFactorization(BinaryOperator &I,
+ Instruction::BinaryOps InnerOpcode,
+ Value *A, Value *B, Value *C, Value *D) {
+ assert(A && B && C && D && "All values must be provided");
+
+ Value *V = nullptr;
+ Value *SimplifiedInst = nullptr;
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ // Does "X op' Y" always equal "Y op' X"?
+ bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
+
+ // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
+ if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+ // commutative case, "(A op' B) op (C op' A)"?
+ if (A == C || (InnerCommutative && A == D)) {
+ if (A != C)
+ std::swap(C, D);
+ // Consider forming "A op' (B op D)".
+ // If "B op D" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
+ // If "B op D" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
+ }
+ }
+
+ // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+ if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+ // commutative case, "(A op' B) op (B op' D)"?
+ if (B == D || (InnerCommutative && B == C)) {
+ if (B != D)
+ std::swap(C, D);
+ // Consider forming "(A op C) op' B".
+ // If "A op C" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+
+ // If "A op C" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
+ }
+ }
+
+ if (SimplifiedInst) {
+ ++NumFactor;
+ SimplifiedInst->takeName(&I);
+
+ // Check if we can add NSW flag to SimplifiedInst. If so, set NSW flag.
+ // TODO: Check for NUW.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
+ if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
+ bool HasNSW = false;
+ if (isa<OverflowingBinaryOperator>(&I))
+ HasNSW = I.hasNoSignedWrap();
+
+ if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS))
+ HasNSW &= LOBO->hasNoSignedWrap();
+
+ if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS))
+ HasNSW &= ROBO->hasNoSignedWrap();
+
+ // We can propagate 'nsw' if we know that
+ // %Y = mul nsw i16 %X, C
+ // %Z = add nsw i16 %Y, %X
+ // =>
+ // %Z = mul nsw i16 %X, C+1
+ //
+ // iff C+1 isn't INT_MIN
+ const APInt *CInt;
+ if (TopLevelOpcode == Instruction::Add &&
+ InnerOpcode == Instruction::Mul)
+ if (match(V, m_APInt(CInt)) && !CInt->isMinSignedValue())
+ BO->setHasNoSignedWrap(HasNSW);
+ }
+ }
+ }
+ return SimplifiedInst;
+}
+
+/// This tries to simplify binary operations which some other binary operation
+/// distributes over, either by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)") or by expanding out if this results in
+/// simplifications (e.g. "A & (B | C) -> (A&B) | (A&C)" if this is a win).
+/// Returns the simplified value, or null if it didn't simplify.
+Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ {
+ // Factorization.
+ Value *A, *B, *C, *D;
+ Instruction::BinaryOps LHSOpcode, RHSOpcode;
+ if (Op0)
+ LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+ if (Op1)
+ RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode)
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
+ return V;
+
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Op0)
+ if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
+ if (Value *V =
+ tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ return V;
+
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Op1)
+ if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
+ if (Value *V =
+ tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ return V;
+ }
+
+ // Expansion.
+ if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+ // The instruction has the form "(A op' B) op C". See if expanding it out
+ // to "(A op C) op' (B op C)" results in simplifications.
+ Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+ Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
+ // Do "A op C" and "B op C" both simplify?
+ if (Value *L =
+ SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)))
+ if (Value *R =
+ SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I))) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ C = Builder.CreateBinOp(InnerOpcode, L, R);
+ C->takeName(&I);
+ return C;
+ }
+ }
+
+ if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+ // The instruction has the form "A op (B op' C)". See if expanding it out
+ // to "(A op B) op' (A op C)" results in simplifications.
+ Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+ Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+
+ // Do "A op B" and "A op C" both simplify?
+ if (Value *L =
+ SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)))
+ if (Value *R =
+ SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ A = Builder.CreateBinOp(InnerOpcode, L, R);
+ A->takeName(&I);
+ return A;
+ }
+ }
+
+ // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0))
+ // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d)))
+ if (auto *SI0 = dyn_cast<SelectInst>(LHS)) {
+ if (auto *SI1 = dyn_cast<SelectInst>(RHS)) {
+ if (SI0->getCondition() == SI1->getCondition()) {
+ Value *SI = nullptr;
+ if (Value *V =
+ SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(),
+ SI1->getFalseValue(), SQ.getWithInstruction(&I)))
+ SI = Builder.CreateSelect(SI0->getCondition(),
+ Builder.CreateBinOp(TopLevelOpcode,
+ SI0->getTrueValue(),
+ SI1->getTrueValue()),
+ V);
+ if (Value *V =
+ SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(),
+ SI1->getTrueValue(), SQ.getWithInstruction(&I)))
+ SI = Builder.CreateSelect(
+ SI0->getCondition(), V,
+ Builder.CreateBinOp(TopLevelOpcode, SI0->getFalseValue(),
+ SI1->getFalseValue()));
+ if (SI) {
+ SI->takeName(&I);
+ return SI;
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
+/// constant zero (which is the 'negate' form).
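+/// For example, given "%n = sub i32 0, %x" this returns %x, and for a plain
+/// constant such as i32 5 it returns the folded constant i32 -5.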
+Value *InstCombiner::dyn_castNegVal(Value *V) const {
+ if (BinaryOperator::isNeg(V))
+ return BinaryOperator::getNegArgument(V);
+
+ // Constants can be considered to be negated values if they can be folded.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+ if (C->getType()->getElementType()->isIntegerTy())
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return nullptr;
+ }
+ return ConstantExpr::getNeg(CV);
+ }
+
+ return nullptr;
+}
+
+/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is
+/// a constant negative zero (which is the 'negate' form).
+Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
+ if (BinaryOperator::isFNeg(V, IgnoreZeroSign))
+ return BinaryOperator::getFNegArgument(V);
+
+ // Constants can be considered to be negated values if they can be folded.
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V))
+ return ConstantExpr::getFNeg(C);
+
+ if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+ if (C->getType()->getElementType()->isFloatingPointTy())
+ return ConstantExpr::getFNeg(C);
+
+ return nullptr;
+}
+
+static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
+ InstCombiner::BuilderTy &Builder) {
+ if (auto *Cast = dyn_cast<CastInst>(&I))
+ return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
+
+ assert(I.isBinaryOp() && "Unexpected opcode for select folding");
+
+ // Figure out if the constant is the left or the right argument.
+ bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+ Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+ if (auto *SOC = dyn_cast<Constant>(SO)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+ return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+ }
+
+ Value *Op0 = SO, *Op1 = ConstOperand;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ auto *BO = cast<BinaryOperator>(&I);
+ Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
+ SO->getName() + ".op");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(BO);
+ return RI;
+}
+
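+/// Try to fold an operation into the arms of a select whose true or false
+/// value is a constant and which has no other users, e.g.
+///   %s = select i1 %c, i32 4, i32 %v
+///   %r = add i32 %s, 1
+/// can become
+///   %t = add i32 %v, 1
+///   %r = select i1 %c, i32 5, i32 %t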
+Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
+ // Don't modify shared select instructions.
+ if (!SI->hasOneUse())
+ return nullptr;
+
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ if (!(isa<Constant>(TV) || isa<Constant>(FV)))
+ return nullptr;
+
+ // Bool selects with constant operands can be folded to logical ops.
+ if (SI->getType()->isIntOrIntVectorTy(1))
+ return nullptr;
+
+ // If it's a bitcast involving vectors, make sure it has the same number of
+ // elements on both sides.
+ if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
+ VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+ VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+
+ // Verify that either both or neither are vectors.
+ if ((SrcTy == nullptr) != (DestTy == nullptr))
+ return nullptr;
+
+ // If vectors, verify that they have the same number of elements.
+ if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
+ return nullptr;
+ }
+
+ // Test if a CmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
+ if (CI->hasOneUse()) {
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+ (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+ return nullptr;
+ }
+ }
+
+ Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
+ Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
+ return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
+}
+
+static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
+ InstCombiner::BuilderTy &Builder) {
+ bool ConstIsRHS = isa<Constant>(I->getOperand(1));
+ Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
+
+ if (auto *InC = dyn_cast<Constant>(InV)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I->getOpcode(), InC, C);
+ return ConstantExpr::get(I->getOpcode(), C, InC);
+ }
+
+ Value *Op0 = InV, *Op1 = C;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phitmp");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(I);
+ return RI;
+}
+
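+/// Try to fold an operation into a phi node by performing the operation on
+/// each incoming value, e.g. "add (phi i32 [ 1, %bb0 ], [ 2, %bb1 ]), 4" can
+/// become "phi i32 [ 5, %bb0 ], [ 6, %bb1 ]". At most one incoming value may
+/// be non-constant.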
+Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (NumPHIValues == 0)
+ return nullptr;
+
+ // We normally only transform phis with a single use. However, if a PHI has
+ // multiple uses and they are all the same operation, we can fold *all* of the
+ // uses into the PHI.
+ if (!PN->hasOneUse()) {
+ // Walk the use list for the instruction, comparing them to I.
+ for (User *U : PN->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI != &I && !I.isIdenticalTo(UI))
+ return nullptr;
+ }
+ // Otherwise, we can replace *all* users with the new PHI we form.
+ }
+
+ // Check to see if all of the operands of the PHI are simple constants
+ // (constantint/constantfp/undef). If there is one non-constant value,
+ // remember the BB it is in. If there is more than one or if *it* is a PHI,
+ // bail out. We don't do arbitrary constant expressions here because moving
+ // their computation can be expensive without a cost model.
+ BasicBlock *NonConstBB = nullptr;
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal))
+ continue;
+
+ if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
+ if (NonConstBB) return nullptr; // More than one non-const value.
+
+ NonConstBB = PN->getIncomingBlock(i);
+
+ // If the InVal is an invoke at the end of the pred block, then we can't
+ // insert a computation after it without breaking the edge.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
+ if (II->getParent() == NonConstBB)
+ return nullptr;
+
+ // If the incoming non-constant value is in I's block, we will remove one
+ // instruction, but insert another equivalent one, leading to infinite
+ // instcombine.
+ if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
+ return nullptr;
+ }
+
+ // If there is exactly one non-constant value, we can insert a copy of the
+ // operation in that block. However, if this is a critical edge, we would be
+ // inserting the computation on some other paths (e.g. inside a loop). Only
+ // do this if the pred block is unconditionally branching into the phi block.
+ if (NonConstBB != nullptr) {
+ BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+ if (!BI || !BI->isUnconditional()) return nullptr;
+ }
+
+ // Okay, we can do the transformation: create the new PHI node.
+ PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
+ InsertNewInstBefore(NewPN, *PN);
+ NewPN->takeName(PN);
+
+ // If we are going to have to insert a new computation, do so right before the
+ // predecessor's terminator.
+ if (NonConstBB)
+ Builder.SetInsertPoint(NonConstBB->getTerminator());
+
+ // Next, add all of the operands to the PHI.
+ if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+ // We only currently try to fold the condition of a select when it is a phi,
+ // not the true/false values.
+ Value *TrueV = SI->getTrueValue();
+ Value *FalseV = SI->getFalseValue();
+ BasicBlock *PhiTransBB = PN->getParent();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ BasicBlock *ThisBB = PN->getIncomingBlock(i);
+ Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *InV = nullptr;
+ // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
+ // even if currently isNullValue gives false.
+ Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+ // For vector constants, we cannot use isNullValue to fold into
+ // FalseVInPred versus TrueVInPred. When we have individual nonzero
+ // elements in the vector, we will incorrectly fold InC to
+ // `TrueVInPred`.
+ if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
+ InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
+ else {
+ // Generate the select in the same block as PN's current incoming block.
+ // Note: ThisBB need not be the NonConstBB because vector constants
+ // which are constants by definition are handled here.
+ // FIXME: This can lead to an increase in IR generation because we might
+ // generate selects for vector constant phi operand, that could not be
+ // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
+ // non-vector phis, this transformation was always profitable because
+ // the select would be generated exactly once in the NonConstBB.
+ Builder.SetInsertPoint(ThisBB->getTerminator());
+ InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
+ FalseVInPred, "phitmp");
+ }
+ NewPN->addIncoming(InV, ThisBB);
+ }
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
+ Constant *C = cast<Constant>(I.getOperand(1));
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = nullptr;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+ else if (isa<ICmpInst>(CI))
+ InV = Builder.CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phitmp");
+ else
+ InV = Builder.CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phitmp");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
+ Builder);
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ } else {
+ CastInst *CI = cast<CastInst>(&I);
+ Type *RetTy = CI->getType();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+ else
+ InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
+ I.getType(), "phitmp");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ }
+
+ for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ if (User == &I) continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
+ return replaceInstUsesWith(I, NewPN);
+}
+
+Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
+ assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
+
+ if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
+ if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ return NewSel;
+ } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
+ return NewPhi;
+ }
+ return nullptr;
+}
+
+/// Given a pointer type and a constant offset, determine whether or not there
+/// is a sequence of GEP indices into the pointed-to type that will land us at
+/// the specified offset. If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
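+///
+/// For example, with a typical DataLayout, PtrTy = {i32, [4 x i16]}* and
+/// Offset = 10 yield the indices {0, 1, 3} and an i16 result type.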
+Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value *> &NewIndices) {
+ Type *Ty = PtrTy->getElementType();
+ if (!Ty->isSized())
+ return nullptr;
+
+ // Start with the index over the outer type. Note that the type size
+ // might be zero (even if the offset isn't zero) if the indexed type
+ // is something like [0 x {int, int}]
+ Type *IntPtrTy = DL.getIntPtrType(PtrTy);
+ int64_t FirstIdx = 0;
+ if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
+ FirstIdx = Offset/TySize;
+ Offset -= FirstIdx*TySize;
+
+ // Handle hosts where % returns negative instead of values [0..TySize).
+ if (Offset < 0) {
+ --FirstIdx;
+ Offset += TySize;
+ assert(Offset >= 0);
+ }
+ assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+ }
+
+ NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+
+  // Index into the types. If we fail, return null.
+ while (Offset) {
+ // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
+ return nullptr;
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ assert(Offset < (int64_t)SL->getSizeInBytes() &&
+ "Offset must stay within the indexed type");
+
+ unsigned Elt = SL->getElementContainingOffset(Offset);
+ NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
+ Elt));
+
+ Offset -= SL->getElementOffset(Elt);
+ Ty = STy->getElementType(Elt);
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
+ assert(EltSize && "Cannot index into a zero-sized array");
+ NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+ Offset %= EltSize;
+ Ty = AT->getElementType();
+ } else {
+ // Otherwise, we can't index into the middle of this atomic type, bail.
+ return nullptr;
+ }
+ }
+
+ return Ty;
+}
+
+static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
+ // If this GEP has only 0 indices, it is the same pointer as
+ // Src. If Src is not a trivial GEP too, don't combine
+ // the indices.
+ if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
+ !Src.hasOneUse())
+ return false;
+ return true;
+}
+
+/// Return a value X such that Val = X * Scale, or null if none.
+/// If the multiplication is known not to overflow, then NoSignedWrap is set.
+Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
+ assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
+ assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
+ Scale.getBitWidth() && "Scale not compatible with value!");
+
+ // If Val is zero or Scale is one then Val = Val * Scale.
+ if (match(Val, m_Zero()) || Scale == 1) {
+ NoSignedWrap = true;
+ return Val;
+ }
+
+ // If Scale is zero then it does not divide Val.
+ if (Scale.isMinValue())
+ return nullptr;
+
+ // Look through chains of multiplications, searching for a constant that is
+ // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
+ // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
+ // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
+ // down from Val:
+ //
+ // Val = M1 * X || Analysis starts here and works down
+ // M1 = M2 * Y || Doesn't descend into terms with more
+ // M2 = Z * 4 \/ than one use
+ //
+ // Then to modify a term at the bottom:
+ //
+ // Val = M1 * X
+ // M1 = Z * Y || Replaced M2 with Z
+ //
+ // Then to work back up correcting nsw flags.
+
+ // Op - the term we are currently analyzing. Starts at Val then drills down.
+ // Replaced with its descaled value before exiting from the drill down loop.
+ Value *Op = Val;
+
+ // Parent - initially null, but after drilling down notes where Op came from.
+ // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
+ // 0'th operand of Val.
+ std::pair<Instruction*, unsigned> Parent;
+
+ // Set if the transform requires a descaling at deeper levels that doesn't
+ // overflow.
+ bool RequireNoSignedWrap = false;
+
+ // Log base 2 of the scale. Negative if not a power of 2.
+ int32_t logScale = Scale.exactLogBase2();
+
+ for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // If Op is a constant divisible by Scale then descale to the quotient.
+ APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
+ APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
+ if (!Remainder.isMinValue())
+ // Not divisible by Scale.
+ return nullptr;
+ // Replace with the quotient in the parent.
+ Op = ConstantInt::get(CI->getType(), Quotient);
+ NoSignedWrap = true;
+ break;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
+
+ if (BO->getOpcode() == Instruction::Mul) {
+ // Multiplication.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ // There are three cases for multiplication: multiplication by exactly
+ // the scale, multiplication by a constant different to the scale, and
+ // multiplication by something else.
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ // Multiplication by a constant.
+ if (CI->getValue() == Scale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+
+ // Otherwise drill down into the constant.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 1);
+ continue;
+ }
+
+ // Multiplication by something else. Drill down into the left-hand side
+ // since that's where the reassociate pass puts the good stuff.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 0);
+ continue;
+ }
+
+ if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(BO->getOperand(1))) {
+ // Multiplication by a power of 2.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
+ getLimitedValue(Scale.getBitWidth());
+ // Op = LHS << Amt.
+
+ if (Amt == logScale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+ if (Amt < logScale || !Op->hasOneUse())
+ return nullptr;
+
+ // Multiplication by more than the scale. Reduce the multiplying amount
+ // by the scale in the parent.
+ Parent = std::make_pair(BO, 1);
+ Op = ConstantInt::get(BO->getType(), Amt - logScale);
+ break;
+ }
+ }
+
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getOpcode() == Instruction::SExt) {
+ // Op is sign-extended from a smaller type, descale in the smaller type.
+ unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ APInt SmallScale = Scale.trunc(SmallSize);
+ // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
+ // descale Op as (sext Y) * Scale. In order to have
+ // sext (Y * SmallScale) = (sext Y) * Scale
+ // some conditions need to hold however: SmallScale must sign-extend to
+ // Scale and the multiplication Y * SmallScale should not overflow.
+ if (SmallScale.sext(Scale.getBitWidth()) != Scale)
+ // SmallScale does not sign-extend to Scale.
+ return nullptr;
+ assert(SmallScale.exactLogBase2() == logScale);
+ // Require that Y * SmallScale must not overflow.
+ RequireNoSignedWrap = true;
+
+ // Drill down through the cast.
+ Parent = std::make_pair(Cast, 0);
+ Scale = SmallScale;
+ continue;
+ }
+
+ if (Cast->getOpcode() == Instruction::Trunc) {
+ // Op is truncated from a larger type, descale in the larger type.
+ // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
+ // trunc (Y * sext Scale) = (trunc Y) * Scale
+ // always holds. However (trunc Y) * Scale may overflow even if
+ // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
+ // from this point up in the expression (see later).
+ if (RequireNoSignedWrap)
+ return nullptr;
+
+ // Drill down through the cast.
+ unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ Parent = std::make_pair(Cast, 0);
+ Scale = Scale.sext(LargeSize);
+ if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
+ logScale = -1;
+ assert(Scale.exactLogBase2() == logScale);
+ continue;
+ }
+ }
+
+ // Unsupported expression, bail out.
+ return nullptr;
+ }
+
+ // If Op is zero then Val = Op * Scale.
+ if (match(Op, m_Zero())) {
+ NoSignedWrap = true;
+ return Op;
+ }
+
+ // We know that we can successfully descale, so from here on we can safely
+ // modify the IR. Op holds the descaled version of the deepest term in the
+ // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
+ // not to overflow.
+
+ if (!Parent.first)
+ // The expression only had one term.
+ return Op;
+
+ // Rewrite the parent using the descaled version of its operand.
+ assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
+ assert(Op != Parent.first->getOperand(Parent.second) &&
+ "Descaling was a no-op?");
+ Parent.first->setOperand(Parent.second, Op);
+ Worklist.Add(Parent.first);
+
+ // Now work back up the expression correcting nsw flags. The logic is based
+ // on the following observation: if X * Y is known not to overflow as a signed
+ // multiplication, and Y is replaced by a value Z with smaller absolute value,
+ // then X * Z will not overflow as a signed multiplication either. As we work
+ // our way up, having NoSignedWrap 'true' means that the descaled value at the
+ // current level has strictly smaller absolute value than the original.
+ Instruction *Ancestor = Parent.first;
+ do {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
+ // If the multiplication wasn't nsw then we can't say anything about the
+ // value of the descaled multiplication, and we have to clear nsw flags
+ // from this point on up.
+ bool OpNoSignedWrap = BO->hasNoSignedWrap();
+ NoSignedWrap &= OpNoSignedWrap;
+ if (NoSignedWrap != OpNoSignedWrap) {
+ BO->setHasNoSignedWrap(NoSignedWrap);
+ Worklist.Add(Ancestor);
+ }
+ } else if (Ancestor->getOpcode() == Instruction::Trunc) {
+ // The fact that the descaled input to the trunc has smaller absolute
+ // value than the original input doesn't tell us anything useful about
+ // the absolute values of the truncations.
+ NoSignedWrap = false;
+ }
+ assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
+ "Failed to keep proper track of nsw flags while drilling down?");
+
+ if (Ancestor == Val)
+ // Got to the top, all done!
+ return Val;
+
+ // Move up one level in the expression.
+ assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
+ Ancestor = Ancestor->user_back();
+ } while (1);
+}
+
+/// \brief Creates a binary-operation node with the same attributes as the
+/// specified one, but with the given operands.
+static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &B) {
+ Value *BO = B.CreateBinOp(Inst.getOpcode(), LHS, RHS);
+ // If LHS and RHS are constant, BO won't be a binary operator.
+ if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO))
+ NewBO->copyIRFlags(&Inst);
+ return BO;
+}
+
+/// \brief Performs transformations of a binary operation that are specific to
+/// vector types.
+/// \param Inst Binary operator to transform.
+/// \return Pointer to the node that must replace the original binary operator,
+/// or a null pointer if no transformation was made.
+Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
+ if (!Inst.getType()->isVectorTy()) return nullptr;
+
+ // It may not be safe to reorder shuffles and things like div, urem, etc.
+ // because we may trap when executing those ops on unknown vector elements.
+ // See PR20059.
+ if (!isSafeToSpeculativelyExecute(&Inst))
+ return nullptr;
+
+ unsigned VWidth = cast<VectorType>(Inst.getType())->getNumElements();
+ Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
+ assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
+ assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
+
+ // If both arguments of the binary operation are shuffles that use the same
+ // mask and shuffle within a single vector, move the shuffle after the binop:
+ // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
+ auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
+ auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
+ isa<UndefValue>(LShuf->getOperand(1)) &&
+ isa<UndefValue>(RShuf->getOperand(1)) &&
+ LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
+ Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
+ RShuf->getOperand(0), Builder);
+ return Builder.CreateShuffleVector(
+ NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
+ }
+
+ // If one argument is a shuffle within one vector, the other is a constant,
+ // try moving the shuffle after the binary operation.
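+  // For example, "add (shufflevector <2 x i32> %v, undef, <i32 1, i32 0>),
+  // <i32 1, i32 2>" can become
+  // "shufflevector (add <2 x i32> %v, <i32 2, i32 1>), undef, <i32 1, i32 0>".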
+ ShuffleVectorInst *Shuffle = nullptr;
+ Constant *C1 = nullptr;
+ if (isa<ShuffleVectorInst>(LHS)) Shuffle = cast<ShuffleVectorInst>(LHS);
+ if (isa<ShuffleVectorInst>(RHS)) Shuffle = cast<ShuffleVectorInst>(RHS);
+ if (isa<Constant>(LHS)) C1 = cast<Constant>(LHS);
+ if (isa<Constant>(RHS)) C1 = cast<Constant>(RHS);
+ if (Shuffle && C1 &&
+ (isa<ConstantVector>(C1) || isa<ConstantDataVector>(C1)) &&
+ isa<UndefValue>(Shuffle->getOperand(1)) &&
+ Shuffle->getType() == Shuffle->getOperand(0)->getType()) {
+ SmallVector<int, 16> ShMask = Shuffle->getShuffleMask();
+ // Find constant C2 that has property:
+ // shuffle(C2, ShMask) = C1
+    // If such a constant does not exist (for example, ShMask=<0,0> and
+    // C1=<1,2>), the reorder is not possible.
+ SmallVector<Constant*, 16> C2M(VWidth,
+ UndefValue::get(C1->getType()->getScalarType()));
+ bool MayChange = true;
+ for (unsigned I = 0; I < VWidth; ++I) {
+ if (ShMask[I] >= 0) {
+ assert(ShMask[I] < (int)VWidth);
+ if (!isa<UndefValue>(C2M[ShMask[I]])) {
+ MayChange = false;
+ break;
+ }
+ C2M[ShMask[I]] = C1->getAggregateElement(I);
+ }
+ }
+ if (MayChange) {
+ Constant *C2 = ConstantVector::get(C2M);
+ Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0);
+ Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2;
+ Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder);
+ return Builder.CreateShuffleVector(NewBO,
+ UndefValue::get(Inst.getType()), Shuffle->getMask());
+ }
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
+
+ if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops,
+ SQ.getWithInstruction(&GEP)))
+ return replaceInstUsesWith(GEP, V);
+
+ Value *PtrOp = GEP.getOperand(0);
+
+ // Eliminate unneeded casts for indices, and replace indices which displace
+ // by multiples of a zero size type with zero.
+ bool MadeChange = false;
+ Type *IntPtrTy =
+ DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType());
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
+ ++I, ++GTI) {
+ // Skip indices into struct types.
+ if (GTI.isStruct())
+ continue;
+
+ // Index type should have the same width as IntPtr
+ Type *IndexTy = (*I)->getType();
+ Type *NewIndexType = IndexTy->isVectorTy() ?
+ VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy;
+
+ // If the element type has zero size then any index over it is equivalent
+ // to an index of zero, so replace it with zero if it is not zero already.
+ Type *EltTy = GTI.getIndexedType();
+ if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0)
+ if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) {
+ *I = Constant::getNullValue(NewIndexType);
+ MadeChange = true;
+ }
+
+ if (IndexTy != NewIndexType) {
+ // If we are using a wider index than needed for this platform, shrink
+ // it to what we need. If narrower, sign-extend it to what we need.
+ // This explicit cast can make subsequent optimizations more obvious.
+ *I = Builder.CreateIntCast(*I, NewIndexType, true);
+ MadeChange = true;
+ }
+ }
+ if (MadeChange)
+ return &GEP;
+
+ // Check to see if the inputs to the PHI node are getelementptr instructions.
+ if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
+ GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (!Op1)
+ return nullptr;
+
+ // Don't fold a GEP into itself through a PHI node. This can only happen
+ // through the back-edge of a loop. Folding a GEP into itself means that
+ // the value of the previous iteration needs to be stored in the meantime,
+ // thus requiring an additional register variable to be live, but not
+ // actually achieving anything (the GEP still needs to be executed once per
+ // loop iteration).
+ if (Op1 == &GEP)
+ return nullptr;
+
+ int DI = -1;
+
+ for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
+ GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+ return nullptr;
+
+ // As for Op1 above, don't try to fold a GEP into itself.
+ if (Op2 == &GEP)
+ return nullptr;
+
+ // Keep track of the type as we walk the GEP.
+ Type *CurTy = nullptr;
+
+ for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+ if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
+ return nullptr;
+
+ if (Op1->getOperand(J) != Op2->getOperand(J)) {
+ if (DI == -1) {
+            // We have not seen any differences in the GEPs feeding the PHI
+            // yet, so we record this one if it is allowed to be a variable.
+
+            // The first two arguments can vary for any GEP; the rest have to
+            // be static for struct slots.
+ if (J > 1 && CurTy->isStructTy())
+ return nullptr;
+
+ DI = J;
+ } else {
+            // The GEP is different by more than one input. While this could
+            // be extended to support GEPs that vary by more than one variable,
+            // it doesn't make sense: it greatly increases the complexity and
+            // would result in an R+R+R addressing mode, which no backend
+            // directly supports and which would need to be broken into several
+            // simpler instructions anyway.
+ return nullptr;
+ }
+ }
+
+ // Sink down a layer of the type for the next iteration.
+ if (J > 0) {
+ if (J == 1) {
+ CurTy = Op1->getSourceElementType();
+ } else if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
+ CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
+ } else {
+ CurTy = nullptr;
+ }
+ }
+ }
+ }
+
+ // If not all GEPs are identical we'll have to create a new PHI node.
+ // Check that the old PHI node has only one use so that it will get
+ // removed.
+ if (DI != -1 && !PN->hasOneUse())
+ return nullptr;
+
+ GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ if (DI == -1) {
+ // All the GEPs feeding the PHI are identical. Clone one down into our
+ // BB so that it can be merged with the current GEP.
+ GEP.getParent()->getInstList().insert(
+ GEP.getParent()->getFirstInsertionPt(), NewGEP);
+ } else {
+ // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+ // into the current block so it can be merged, and create a new PHI to
+ // set that index.
+ PHINode *NewPN;
+ {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PN);
+ NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
+ }
+
+ for (auto &I : PN->operands())
+ NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
+ PN->getIncomingBlock(I));
+
+ NewGEP->setOperand(DI, NewPN);
+ GEP.getParent()->getInstList().insert(
+ GEP.getParent()->getFirstInsertionPt(), NewGEP);
+ }
+
+ GEP.setOperand(0, NewGEP);
+ PtrOp = NewGEP;
+ }
+
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction, combine the indices of the two
+ // getelementptr instructions into a single instruction.
+ //
+ if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) {
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+ return nullptr;
+
+ // Note that if our source is a gep chain itself then we wait for that
+ // chain to be resolved before we perform this transformation. This
+ // avoids us creating a TON of code in some cases.
+ if (GEPOperator *SrcGEP =
+ dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+ return nullptr; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+ I != E; ++I)
+ EndsWithSequential = I.isSequential();
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ //
+ Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+ Value *GO1 = GEP.getOperand(1);
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
+
+ // Update the GEP in place if possible.
+ if (Src->getNumOperands() == 2) {
+ GEP.setOperand(0, Src->getOperand(0));
+ GEP.setOperand(1, Sum);
+ return &GEP;
+ }
+ Indices.append(Src->op_begin()+1, Src->op_end()-1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin()+2, GEP.op_end());
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ Src->getNumOperands() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.append(Src->op_begin()+1, Src->op_end());
+ Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+ }
+
+ if (!Indices.empty())
+ return GEP.isInBounds() && Src->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
+ }
+
+ if (GEP.getNumIndices() == 1) {
+ unsigned AS = GEP.getPointerAddressSpace();
+ if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+ DL.getPointerSizeInBits(AS)) {
+ Type *Ty = GEP.getSourceElementType();
+ uint64_t TyAllocSize = DL.getTypeAllocSize(Ty);
+
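+      // Illustrative note (not part of the original comments): the matches
+      // below recognize an index that already encodes a byte offset divided
+      // back down by the element size, e.g. "ashr i64 %off, 2" indexing i32
+      // elements, or "sdiv i64 %off, 8" indexing i64 elements; in both cases
+      // %off acts as the byte offset that the canonicalizations below
+      // operate on.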
+ bool Matched = false;
+ uint64_t C;
+ Value *V = nullptr;
+ if (TyAllocSize == 1) {
+ V = GEP.getOperand(1);
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_AShr(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == 1ULL << C)
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_SDiv(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == C)
+ Matched = true;
+ }
+
+ if (Matched) {
+ // Canonicalize (gep i8* X, -(ptrtoint Y))
+ // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
+ // The GEP pattern is emitted by the SCEV expander for certain kinds of
+ // pointer arithmetic.
+ if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
+ Operator *Index = cast<Operator>(V);
+ Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
+ Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+ }
+ // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
+ // to (bitcast Y)
+ Value *Y;
+ if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
+ m_PtrToInt(m_Specific(GEP.getOperand(0)))))) {
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y,
+ GEP.getType());
+ }
+ }
+ }
+ }
+
+ // We do not handle pointer-vector geps here.
+ if (GEP.getType()->isVectorTy())
+ return nullptr;
+
+ // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+ Value *StrippedPtr = PtrOp->stripPointerCasts();
+ PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
+ if (StrippedPtr != PtrOp) {
+ bool HasZeroPointerIndex = false;
+ if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+ HasZeroPointerIndex = C->isZero();
+
+ // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+ // into : GEP [10 x i8]* X, i32 0, ...
+ //
+ // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+ // into : GEP i8* X, ...
+ //
+ // This occurs when the program declares an array extern like "int X[];"
+ if (HasZeroPointerIndex) {
+ if (ArrayType *CATy =
+ dyn_cast<ArrayType>(GEP.getSourceElementType())) {
+ // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
+ // -> GEP i8* X, ...
+ SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
+ GetElementPtrInst *Res = GetElementPtrInst::Create(
+ StrippedPtrTy->getElementType(), StrippedPtr, Idx, GEP.getName());
+ Res->setIsInBounds(GEP.isInBounds());
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
+ return Res;
+ // Insert Res, and create an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
+ // ->
+ // %0 = GEP i8 addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEP.getType());
+ }
+
+ if (ArrayType *XATy =
+ dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
+ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == XATy->getElementType()) {
+ // -> GEP [10 x i8]* X, i32 0, ...
+ // At this point, we know that the cast source type is a pointer
+ // to an array of the same type as the destination pointer
+ // array. Because the array type is never stepped over (there
+ // is a leading zero) we can fold the cast into this GEP.
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
+ GEP.setOperand(0, StrippedPtr);
+ GEP.setSourceElementType(XATy);
+ return &GEP;
+ }
+ // Cannot replace the base pointer directly because StrippedPtr's
+ // address space is different. Instead, create a new GEP followed by
+ // an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
+ // i32 0, ...
+ // ->
+ // %0 = GEP [10 x i8] addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
+ SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end());
+ Value *NewGEP = GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(
+ nullptr, StrippedPtr, Idx, GEP.getName())
+ : Builder.CreateGEP(nullptr, StrippedPtr, Idx,
+ GEP.getName());
+ return new AddrSpaceCastInst(NewGEP, GEP.getType());
+ }
+ }
+ }
+ } else if (GEP.getNumOperands() == 2) {
+ // Transform things like:
+ // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
+ // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
+ Type *SrcElTy = StrippedPtrTy->getElementType();
+ Type *ResElTy = GEP.getSourceElementType();
+ if (SrcElTy->isArrayTy() &&
+ DL.getTypeAllocSize(SrcElTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(ResElTy)) {
+ Type *IdxType = DL.getIntPtrType(GEP.getType());
+ Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(nullptr, StrippedPtr, Idx,
+ GEP.getName())
+ : Builder.CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName());
+
+ // V and GEP are both pointer types --> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEP.getType());
+ }
+
+ // Transform things like:
+ // %V = mul i64 %N, 4
+ // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
+ // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
+ if (ResElTy->isSized() && SrcElTy->isSized()) {
+ // Check that changing the type amounts to dividing the index by a scale
+ // factor.
+ uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t SrcSize = DL.getTypeAllocSize(SrcElTy);
+ if (ResSize && SrcSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = SrcSize / ResSize;
+
+ // Earlier transforms ensure that the index has type IntPtrType, which
+ // considerably simplifies the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
+ "Index not cast to pointer width?");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(nullptr, StrippedPtr, NewIdx,
+ GEP.getName())
+ : Builder.CreateGEP(nullptr, StrippedPtr, NewIdx,
+ GEP.getName());
+
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEP.getType());
+ }
+ }
+ }
+
+ // Similarly, transform things like:
+ // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+ // (where tmp = 8*tmp2) into:
+ // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+ if (ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) {
+ // Check that changing to the array element type amounts to dividing the
+ // index by a scale factor.
+ uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t ArrayEltSize =
+ DL.getTypeAllocSize(SrcElTy->getArrayElementType());
+ if (ResSize && ArrayEltSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = ArrayEltSize / ResSize;
+
+ // Earlier transforms ensure that the index has type IntPtrType, which
+ // considerably simplifies the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
+ "Index not cast to pointer width?");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *Off[2] = {
+ Constant::getNullValue(DL.getIntPtrType(GEP.getType())),
+ NewIdx};
+
+ Value *NewGEP = GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(
+ SrcElTy, StrippedPtr, Off, GEP.getName())
+ : Builder.CreateGEP(SrcElTy, StrippedPtr, Off,
+ GEP.getName());
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEP.getType());
+ }
+ }
+ }
+ }
+ }
+
+ // addrspacecast between types is canonicalized as a bitcast, then an
+ // addrspacecast. To take advantage of the below bitcast + struct GEP, look
+ // through the addrspacecast.
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+ // X = bitcast A addrspace(1)* to B addrspace(1)*
+ // Y = addrspacecast A addrspace(1)* to B addrspace(2)*
+ // Z = gep Y, <...constant indices...>
+ // Into an addrspacecasted GEP of the struct.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+ PtrOp = BC;
+ }
+
+ /// See if we can simplify:
+ /// X = bitcast A* to B*
+ /// Y = gep X, <...constant indices...>
+ /// into a gep of the original struct. This is important for SROA and alias
+ /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+ Value *Operand = BCI->getOperand(0);
+ PointerType *OpType = cast<PointerType>(Operand->getType());
+ unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType());
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(Operand) &&
+ GEP.accumulateConstantOffset(DL, Offset)) {
+
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (!Offset) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, &TLI)) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
+ replaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+
+ if (Operand->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(Operand, GEP.getType());
+ return new BitCastInst(Operand, GEP.getType());
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
+ SmallVector<Value*, 8> NewIndices;
+ if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(nullptr, Operand, NewIndices)
+ : Builder.CreateGEP(nullptr, Operand, NewIndices);
+
+ if (NGEP->getType() == GEP.getType())
+ return replaceInstUsesWith(GEP, NGEP);
+ NGEP->takeName(&GEP);
+
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEP.getType());
+ return new BitCastInst(NGEP, GEP.getType());
+ }
+ }
+ }
+
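+  // Illustrative example (not part of the original comments): a non-inbounds
+  //   %p = getelementptr [16 x i32], [16 x i32]* %a, i64 0, i64 4
+  // on an "alloca [16 x i32]" has a constant, non-negative offset (16 bytes)
+  // that does not exceed the 64-byte allocation, so it can be marked
+  // inbounds by the code below.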
+ if (!GEP.isInBounds()) {
+ unsigned PtrWidth =
+ DL.getPointerSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(PtrWidth, 0);
+ Value *UnderlyingPtrOp =
+ PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
+ BasePtrOffset);
+ if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
+ if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
+ BasePtrOffset.isNonNegative()) {
+ APInt AllocSize(PtrWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
+ if (BasePtrOffset.ule(AllocSize)) {
+ return GetElementPtrInst::CreateInBounds(
+ PtrOp, makeArrayRef(Ops).slice(1), GEP.getName());
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
+ Instruction *AI) {
+ if (isa<ConstantPointerNull>(V))
+ return true;
+ if (auto *LI = dyn_cast<LoadInst>(V))
+ return isa<GlobalVariable>(LI->getPointerOperand());
+ // Two distinct allocations will never be equal.
+  // We rely on the LookThroughBitCast parameter of isAllocLikeFn being false,
+  // since looking through bitcasts of V could make the return statement below
+  // true even when AI and V (e.g., i8* -> i32* -> i8* of AI) are the same
+  // allocation.
+ return isAllocLikeFn(V, TLI) && V != AI;
+}
+
+static bool isAllocSiteRemovable(Instruction *AI,
+ SmallVectorImpl<WeakTrackingVH> &Users,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Instruction*, 4> Worklist;
+ Worklist.push_back(AI);
+
+ do {
+ Instruction *PI = Worklist.pop_back_val();
+ for (User *U : PI->users()) {
+ Instruction *I = cast<Instruction>(U);
+ switch (I->getOpcode()) {
+ default:
+ // Give up the moment we see something we can't handle.
+ return false;
+
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ Users.emplace_back(I);
+ Worklist.push_back(I);
+ continue;
+
+ case Instruction::ICmp: {
+ ICmpInst *ICI = cast<ICmpInst>(I);
+ // We can fold eq/ne comparisons with null to false/true, respectively.
+ // We also fold comparisons in some conditions provided the alloc has
+ // not escaped (see isNeverEqualToUnescapedAlloc).
+ if (!ICI->isEquality())
+ return false;
+ unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
+ if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+
+ case Instruction::Call:
+ // Ignore no-op and store intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::memset: {
+ MemIntrinsic *MI = cast<MemIntrinsic>(II);
+ if (MI->isVolatile() || MI->getRawDest() != PI)
+ return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::objectsize:
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+
+ if (isFreeCall(I, TLI)) {
+ Users.emplace_back(I);
+ continue;
+ }
+ return false;
+
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ if (SI->isVolatile() || SI->getPointerOperand() != PI)
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+ llvm_unreachable("missing a return?");
+ }
+ } while (!Worklist.empty());
+ return true;
+}
+
+Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
+  // If we have a malloc call that is used only in comparisons to null and in
+  // free calls, delete the calls and replace the comparisons with true or
+  // false as appropriate.
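+  // Illustrative example (not part of the original comments):
+  //   %p = call i8* @malloc(i64 16)
+  //   %c = icmp eq i8* %p, null
+  //   call void @free(i8* %p)
+  // With no other uses of %p, both calls are deleted and %c becomes false.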
+ SmallVector<WeakTrackingVH, 64> Users;
+ if (isAllocSiteRemovable(&MI, Users, &TLI)) {
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+      // Lower all @llvm.objectsize calls first because they may use a
+      // bitcast/GEP of the alloca we are removing.
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ ConstantInt *Result = lowerObjectSizeCall(II, DL, &TLI,
+ /*MustSucceed=*/true);
+ replaceInstUsesWith(*I, Result);
+ eraseInstFromFunction(*I);
+ Users[i] = nullptr; // Skip examining in the next loop.
+ }
+ }
+ }
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+ replaceInstUsesWith(*C,
+ ConstantInt::get(Type::getInt1Ty(C->getContext()),
+ C->isFalseWhenEqual()));
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<AddrSpaceCastInst>(I)) {
+ replaceInstUsesWith(*I, UndefValue::get(I->getType()));
+ }
+ eraseInstFromFunction(*I);
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+ // Replace invoke with a NOP intrinsic to maintain the original CFG
+ Module *M = II->getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+ InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+ None, "", II->getParent());
+ }
+ return eraseInstFromFunction(MI);
+ }
+ return nullptr;
+}
+
+/// \brief Move the call to free before a NULL test.
+///
+/// Check whether this call to free is reached only after its argument has
+/// been tested against NULL (property 0).
+/// If so, it is legal to move this call into its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// Profitability is not a concern here; this function should be called only
+/// if the caller knows this transformation would be profitable (e.g., for
+/// code size).
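+///
+/// Illustrative shape of the CFG this matches (block names are made up):
+///   pred:    %c = icmp eq i8* %p, null
+///            br i1 %c, label %succ, label %free_bb
+///   free_bb: call void @free(i8* %p)
+///            br label %succ
+/// The call to free is moved into 'pred', leaving free_bb with nothing but
+/// its unconditional branch.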
+static Instruction *
+tryToMoveFreeBeforeNullTest(CallInst &FI) {
+ Value *Op = FI.getArgOperand(0);
+ BasicBlock *FreeInstrBB = FI.getParent();
+ BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+ // Validate part of constraint #1: Only one predecessor
+  // FIXME: We could allow more than one predecessor, but in that case we
+  // would have to duplicate the call to free in each predecessor, and that
+  // may not be profitable even for code size.
+ if (!PredBB)
+ return nullptr;
+
+  // Validate constraint #2: Does this block contain only the call to
+  // free and an unconditional branch?
+ // FIXME: We could check if we can speculate everything in the
+ // predecessor block
+ if (FreeInstrBB->size() != 2)
+ return nullptr;
+ BasicBlock *SuccBB;
+ if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB)))
+ return nullptr;
+
+ // Validate the rest of constraint #1 by matching on the pred branch.
+ TerminatorInst *TI = PredBB->getTerminator();
+ BasicBlock *TrueBB, *FalseBB;
+ ICmpInst::Predicate Pred;
+ if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
+ return nullptr;
+ if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+
+ // Validate constraint #3: Ensure the null case just falls through.
+ if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+ return nullptr;
+ assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+ "Broken CFG: missing edge from predecessor to successor");
+
+ FI.moveBefore(TI);
+ return &FI;
+}
+
+
+Instruction *InstCombiner::visitFree(CallInst &FI) {
+ Value *Op = FI.getArgOperand(0);
+
+ // free undef -> unreachable.
+ if (isa<UndefValue>(Op)) {
+    // Leave a store to an undef pointer as a marker of unreachability, since
+    // we cannot modify the CFG here.
+ Builder.CreateStore(ConstantInt::getTrue(FI.getContext()),
+ UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
+ return eraseInstFromFunction(FI);
+ }
+
+  // If we have 'free null' delete the instruction. This can happen in STL
+  // code when lots of inlining happens.
+ if (isa<ConstantPointerNull>(Op))
+ return eraseInstFromFunction(FI);
+
+  // If we optimize for code size, try to move the call to free before the
+  // null test so that simplifycfg can remove the empty block and dead code
+  // elimination can remove the branch. I.e., this helps to turn something
+  // like:
+ // if (foo) free(foo);
+ // into
+ // free(foo);
+ if (MinimizeSize)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI))
+ return I;
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
+ if (RI.getNumOperands() == 0) // ret void
+ return nullptr;
+
+ Value *ResultOp = RI.getOperand(0);
+ Type *VTy = ResultOp->getType();
+ if (!VTy->isIntegerTy())
+ return nullptr;
+
+ // There might be assume intrinsics dominating this return that completely
+ // determine the value. If so, constant fold it.
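+  // Illustrative example (not part of the original comment):
+  //   %c = icmp eq i32 %x, 7
+  //   call void @llvm.assume(i1 %c)
+  //   ret i32 %x    ; %x is known to be 7, so we return the constant 7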
+ KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
+ if (Known.isConstant())
+ RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant()));
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+ // Change br (not X), label True, label False to: br X, label False, True
+ Value *X = nullptr;
+ BasicBlock *TrueDest;
+ BasicBlock *FalseDest;
+ if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+ !isa<Constant>(X)) {
+ // Swap Destinations and condition...
+ BI.setCondition(X);
+ BI.swapSuccessors();
+ return &BI;
+ }
+
+ // If the condition is irrelevant, remove the use so that other
+ // transforms on the condition become more effective.
+ if (BI.isConditional() &&
+ BI.getSuccessor(0) == BI.getSuccessor(1) &&
+ !isa<UndefValue>(BI.getCondition())) {
+ BI.setCondition(UndefValue::get(BI.getCondition()->getType()));
+ return &BI;
+ }
+
+ // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq.
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest,
+ FalseDest)) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition());
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ BI.swapSuccessors();
+ Worklist.Add(Cond);
+ return &BI;
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+ Value *Cond = SI.getCondition();
+ Value *Op0;
+ ConstantInt *AddRHS;
+ if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
+ // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
+ assert(isa<ConstantInt>(NewCase) &&
+ "Result of expression should be constant");
+ Case.setValue(cast<ConstantInt>(NewCase));
+ }
+ SI.setCondition(Op0);
+ return &SI;
+ }
+
+ KnownBits Known = computeKnownBits(Cond, 0, &SI);
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
+
+ // Compute the number of leading bits we can ignore.
+ // TODO: A better way to determine this would use ComputeNumSignBits().
+ for (auto &C : SI.cases()) {
+ LeadingKnownZeros = std::min(
+ LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
+ LeadingKnownOnes = std::min(
+ LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
+ }
+
+  unsigned NewWidth =
+      Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
+
+ // Shrink the condition operand if the new type is smaller than the old type.
+ // This may produce a non-standard type for the switch, but that's ok because
+ // the backend should extend back to a legal type for the target.
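+  // Illustrative example (not part of the original comment): if the top 24
+  // bits of an i32 condition are known zero and every case value fits in the
+  // low 8 bits, NewWidth becomes 8 and the switch is rewritten over an i8
+  // truncation of the condition.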
+ if (NewWidth > 0 && NewWidth < Known.getBitWidth()) {
+ IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
+ Builder.SetInsertPoint(&SI);
+ Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
+ SI.setCondition(NewCond);
+
+ for (auto Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ }
+ return &SI;
+ }
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+ Value *Agg = EV.getAggregateOperand();
+
+ if (!EV.hasIndices())
+ return replaceInstUsesWith(EV, Agg);
+
+ if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
+ SQ.getWithInstruction(&EV)))
+ return replaceInstUsesWith(EV, V);
+
+ if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+ // We're extracting from an insertvalue instruction, compare the indices
+ const unsigned *exti, *exte, *insi, *inse;
+ for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+ exte = EV.idx_end(), inse = IV->idx_end();
+ exti != exte && insi != inse;
+ ++exti, ++insi) {
+ if (*insi != *exti)
+ // The insert and extract both reference distinctly different elements.
+ // This means the extract is not influenced by the insert, and we can
+ // replace the aggregate operand of the extract with the aggregate
+ // operand of the insert. i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 0
+ // with
+ // %E = extractvalue { i32, { i32 } } %A, 0
+ return ExtractValueInst::Create(IV->getAggregateOperand(),
+ EV.getIndices());
+ }
+ if (exti == exte && insi == inse)
+ // Both iterators are at the end: Index lists are identical. Replace
+ // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %C = extractvalue { i32, { i32 } } %B, 1, 0
+ // with "i32 42"
+ return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
+ if (exti == exte) {
+ // The extract list is a prefix of the insert list. i.e. replace
+ // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %E = extractvalue { i32, { i32 } } %I, 1
+ // with
+ // %X = extractvalue { i32, { i32 } } %A, 1
+ // %E = insertvalue { i32 } %X, i32 42, 0
+ // by switching the order of the insert and extract (though the
+ // insertvalue should be left in, since it may have other uses).
+ Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
+ EV.getIndices());
+ return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+ makeArrayRef(insi, inse));
+ }
+ if (insi == inse)
+ // The insert list is a prefix of the extract list
+ // We can simply remove the common indices from the extract and make it
+ // operate on the inserted value instead of the insertvalue result.
+ // i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 1, 0
+ // with
+      // %E = extractvalue { i32 } { i32 42 }, 0
+ return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+ makeArrayRef(exti, exte));
+ }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) {
+ // We're extracting from an intrinsic, see if we're the only user, which
+ // allows us to simplify multiple result intrinsics to simpler things that
+ // just get one value.
+ if (II->hasOneUse()) {
+ // Check if we're grabbing the overflow bit or the result of a 'with
+ // overflow' intrinsic. If it's the latter we can remove the intrinsic
+ // and replace it with a traditional binary instruction.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
+ return BinaryOperator::CreateAdd(LHS, RHS);
+ }
+
+ // If the normal result of the add is dead, and the RHS is a constant,
+ // we can transform this into a range comparison.
+ // overflow = uadd a, -4 --> overflow = icmp ugt a, 3
+ if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getArgOperand(1)))
+ return new ICmpInst(ICmpInst::ICMP_UGT, II->getArgOperand(0),
+ ConstantExpr::getNot(CI));
+ break;
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
+ return BinaryOperator::CreateSub(LHS, RHS);
+ }
+ break;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+ replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+ eraseInstFromFunction(*II);
+ return BinaryOperator::CreateMul(LHS, RHS);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Agg))
+ // If the (non-volatile) load only has one use, we can rewrite this to a
+ // load from a GEP. This reduces the size of the load. If a load is used
+ // only by extractvalue instructions then this either must have been
+ // optimized before, or it is a struct with padding, in which case we
+ // don't want to do the transformation as it loses padding knowledge.
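+    // Illustrative example (not part of the original comment):
+    //   %L = load { i32, i32 }, { i32, i32 }* %p
+    //   %E = extractvalue { i32, i32 } %L, 1
+    // becomes a GEP to field 1 of %p followed by a load of just that i32.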
+ if (L->isSimple() && L->hasOneUse()) {
+ // extractvalue has integer indices, getelementptr has Value*s. Convert.
+ SmallVector<Value*, 4> Indices;
+ // Prefix an i32 0 since we need the first element.
+ Indices.push_back(Builder.getInt32(0));
+ for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
+ I != E; ++I)
+ Indices.push_back(Builder.getInt32(*I));
+
+ // We need to insert these at the location of the old load, not at that of
+ // the extractvalue.
+ Builder.SetInsertPoint(L);
+ Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
+ L->getPointerOperand(), Indices);
+ Instruction *NL = Builder.CreateLoad(GEP);
+      // Whatever aliasing information we had for the original load must also
+      // hold for the smaller load, so propagate the annotations.
+ AAMDNodes Nodes;
+ L->getAAMetadata(Nodes);
+ NL->setAAMetadata(Nodes);
+ // Returning the load directly will cause the main loop to insert it in
+ // the wrong spot, so use replaceInstUsesWith().
+ return replaceInstUsesWith(EV, NL);
+ }
+ // We could simplify extracts from other values. Note that nested extracts may
+ // already be simplified implicitly by the above: extract (extract (insert) )
+ // will be translated into extract ( insert ( extract ) ) first and then just
+ // the value inserted, if appropriate. Similarly for extracts from single-use
+ // loads: extract (extract (load)) will be translated to extract (load (gep))
+ // and if again single-use then via load (gep (gep)) to load (gep).
+ // However, double extracts from e.g. function arguments or return values
+ // aren't handled yet.
+ return nullptr;
+}
+
+/// Return 'true' if the given typeinfo will match anything.
+static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
+ switch (Personality) {
+ case EHPersonality::GNU_C:
+ case EHPersonality::GNU_C_SjLj:
+ case EHPersonality::Rust:
+    // The GCC C EH and Rust personalities only exist to support cleanups, so
+    // it's not clear what the semantics of catch clauses are.
+ return false;
+ case EHPersonality::Unknown:
+ return false;
+ case EHPersonality::GNU_Ada:
+ // While __gnat_all_others_value will match any Ada exception, it doesn't
+ // match foreign exceptions (or didn't, before gcc-4.7).
+ return false;
+ case EHPersonality::GNU_CXX:
+ case EHPersonality::GNU_CXX_SjLj:
+ case EHPersonality::GNU_ObjC:
+ case EHPersonality::MSVC_X86SEH:
+ case EHPersonality::MSVC_Win64SEH:
+ case EHPersonality::MSVC_CXX:
+ case EHPersonality::CoreCLR:
+ return TypeInfo->isNullValue();
+ }
+ llvm_unreachable("invalid enum");
+}
+
+static bool shorter_filter(const Value *LHS, const Value *RHS) {
+ return
+ cast<ArrayType>(LHS->getType())->getNumElements()
+ <
+ cast<ArrayType>(RHS->getType())->getNumElements();
+}
+
+Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
+ // The logic here should be correct for any real-world personality function.
+ // However if that turns out not to be true, the offending logic can always
+ // be conditioned on the personality function, like the catch-all logic is.
+ EHPersonality Personality =
+ classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
+
+  // Simplify the list of clauses, e.g. by removing repeated catch clauses
+ // (these are often created by inlining).
+ bool MakeNewInstruction = false; // If true, recreate using the following:
+ SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
+ bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
+
+ SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
+ for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
+ bool isLastClause = i + 1 == e;
+ if (LI.isCatch(i)) {
+ // A catch clause.
+ Constant *CatchClause = LI.getClause(i);
+ Constant *TypeInfo = CatchClause->stripPointerCasts();
+
+ // If we already saw this clause, there is no point in having a second
+ // copy of it.
+ if (AlreadyCaught.insert(TypeInfo).second) {
+ // This catch clause was not already seen.
+ NewClauses.push_back(CatchClause);
+ } else {
+ // Repeated catch clause - drop the redundant copy.
+ MakeNewInstruction = true;
+ }
+
+ // If this is a catch-all then there is no point in keeping any following
+ // clauses or marking the landingpad as having a cleanup.
+ if (isCatchAll(Personality, TypeInfo)) {
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+ } else {
+ // A filter clause. If any of the filter elements were already caught
+ // then they can be dropped from the filter. It is tempting to try to
+ // exploit the filter further by saying that any typeinfo that does not
+ // occur in the filter can't be caught later (and thus can be dropped).
+ // However this would be wrong, since typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some
+ // class derived from it).
+ assert(LI.isFilter(i) && "Unsupported landingpad clause!");
+ Constant *FilterClause = LI.getClause(i);
+ ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
+ unsigned NumTypeInfos = FilterType->getNumElements();
+
+ // An empty filter catches everything, so there is no point in keeping any
+ // following clauses or marking the landingpad as having a cleanup. By
+ // dealing with this case here the following code is made a bit simpler.
+ if (!NumTypeInfos) {
+ NewClauses.push_back(FilterClause);
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+
+ bool MakeNewFilter = false; // If true, make a new filter.
+ SmallVector<Constant *, 16> NewFilterElts; // New elements.
+ if (isa<ConstantAggregateZero>(FilterClause)) {
+ // Not an empty filter - it contains at least one null typeinfo.
+ assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
+ Constant *TypeInfo =
+ Constant::getNullValue(FilterType->getElementType());
+ // If this typeinfo is a catch-all then the filter can never match.
+ if (isCatchAll(Personality, TypeInfo)) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // There is no point in having multiple copies of this typeinfo, so
+ // discard all but the first copy if there is more than one.
+ NewFilterElts.push_back(TypeInfo);
+ if (NumTypeInfos > 1)
+ MakeNewFilter = true;
+ } else {
+ ConstantArray *Filter = cast<ConstantArray>(FilterClause);
+ SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
+ NewFilterElts.reserve(NumTypeInfos);
+
+ // Remove any filter elements that were already caught or that already
+ // occurred in the filter. While there, see if any of the elements are
+ // catch-alls. If so, the filter can be discarded.
+ bool SawCatchAll = false;
+ for (unsigned j = 0; j != NumTypeInfos; ++j) {
+ Constant *Elt = Filter->getOperand(j);
+ Constant *TypeInfo = Elt->stripPointerCasts();
+ if (isCatchAll(Personality, TypeInfo)) {
+ // This element is a catch-all. Bail out, noting this fact.
+ SawCatchAll = true;
+ break;
+ }
+
+ // Even if we've seen a type in a catch clause, we don't want to
+ // remove it from the filter. An unexpected type handler may be
+ // set up for a call site which throws an exception of the same
+ // type caught. In order for the exception thrown by the unexpected
+ // handler to propagate correctly, the filter must be correctly
+ // described for the call site.
+ //
+ // Example:
+ //
+ // void unexpected() { throw 1;}
+ // void foo() throw (int) {
+ // std::set_unexpected(unexpected);
+ // try {
+ // throw 2.0;
+ // } catch (int i) {}
+ // }
+
+ // There is no point in having multiple copies of the same typeinfo in
+ // a filter, so only add it if we didn't already.
+ if (SeenInFilter.insert(TypeInfo).second)
+ NewFilterElts.push_back(cast<Constant>(Elt));
+ }
+ // A filter containing a catch-all cannot match anything by definition.
+ if (SawCatchAll) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // If we dropped something from the filter, make a new one.
+ if (NewFilterElts.size() < NumTypeInfos)
+ MakeNewFilter = true;
+ }
+ if (MakeNewFilter) {
+ FilterType = ArrayType::get(FilterType->getElementType(),
+ NewFilterElts.size());
+ FilterClause = ConstantArray::get(FilterType, NewFilterElts);
+ MakeNewInstruction = true;
+ }
+
+ NewClauses.push_back(FilterClause);
+
+ // If the new filter is empty then it will catch everything so there is
+ // no point in keeping any following clauses or marking the landingpad
+ // as having a cleanup. The case of the original filter being empty was
+ // already handled above.
+ if (MakeNewFilter && !NewFilterElts.size()) {
+ assert(MakeNewInstruction && "New filter but not a new instruction!");
+ CleanupFlag = false;
+ break;
+ }
+ }
+ }
+
+ // If several filters occur in a row then reorder them so that the shortest
+ // filters come first (those with the smallest number of elements). This is
+ // advantageous because shorter filters are more likely to match, speeding up
+ // unwinding, but mostly because it increases the effectiveness of the other
+ // filter optimizations below.
+ for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
+ unsigned j;
+ // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
+ for (j = i; j != e; ++j)
+ if (!isa<ArrayType>(NewClauses[j]->getType()))
+ break;
+
+ // Check whether the filters are already sorted by length. We need to know
+ // if sorting them is actually going to do anything so that we only make a
+ // new landingpad instruction if it does.
+ for (unsigned k = i; k + 1 < j; ++k)
+ if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
+ // Not sorted, so sort the filters now. Doing an unstable sort would be
+ // correct too but reordering filters pointlessly might confuse users.
+ std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
+ shorter_filter);
+ MakeNewInstruction = true;
+ break;
+ }
+
+ // Look for the next batch of filters.
+ i = j + 1;
+ }
+
+ // If typeinfos matched if and only if equal, then the elements of a filter L
+ // that occurs later than a filter F could be replaced by the intersection of
+ // the elements of F and L. In reality two typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some class
+ // derived from it) so it would be wrong to perform this transform in general.
+ // However the transform is correct and useful if F is a subset of L. In that
+ // case L can be replaced by F, and thus removed altogether since repeating a
+ // filter is pointless. So here we look at all pairs of filters F and L where
+ // L follows F in the list of clauses, and remove L if every element of F is
+ // an element of L. This can occur when inlining C++ functions with exception
+ // specifications.
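+  // Illustrative example (not part of the original comment): with clauses
+  //   filter [1 x i8*] [i8* @ti_A]
+  //   filter [2 x i8*] [i8* @ti_A, i8* @ti_B]
+  // the first filter is a subset of the second, so the second is dropped.
+  // (@ti_A and @ti_B are made-up typeinfo symbols.)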
+ for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
+ // Examine each filter in turn.
+ Value *Filter = NewClauses[i];
+ ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
+ if (!FTy)
+ // Not a filter - skip it.
+ continue;
+ unsigned FElts = FTy->getNumElements();
+ // Examine each filter following this one. Doing this backwards means that
+ // we don't have to worry about filters disappearing under us when removed.
+ for (unsigned j = NewClauses.size() - 1; j != i; --j) {
+ Value *LFilter = NewClauses[j];
+ ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
+ if (!LTy)
+ // Not a filter - skip it.
+ continue;
+ // If Filter is a subset of LFilter, i.e. every element of Filter is also
+ // an element of LFilter, then discard LFilter.
+ SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
+ // If Filter is empty then it is a subset of LFilter.
+ if (!FElts) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ // Move on to the next filter.
+ continue;
+ }
+ unsigned LElts = LTy->getNumElements();
+ // If Filter is longer than LFilter then it cannot be a subset of it.
+ if (FElts > LElts)
+ // Move on to the next filter.
+ continue;
+ // At this point we know that LFilter has at least one element.
+ if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
+ // Filter is a subset of LFilter iff Filter contains only zeros (as we
+ // already know that Filter is not longer than LFilter).
+ if (isa<ConstantAggregateZero>(Filter)) {
+ assert(FElts <= LElts && "Should have handled this case earlier!");
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ ConstantArray *LArray = cast<ConstantArray>(LFilter);
+ if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
+ // Since Filter is non-empty and contains only zeros, it is a subset of
+ // LFilter iff LFilter contains a zero.
+ assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
+ for (unsigned l = 0; l != LElts; ++l)
+ if (LArray->getOperand(l)->isNullValue()) {
+ // LFilter contains a zero - discard it.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ break;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ // At this point we know that both filters are ConstantArrays. Loop over
+ // operands to see whether every element of Filter is also an element of
+ // LFilter. Since filters tend to be short this is probably faster than
+ // using a method that scales nicely.
+ ConstantArray *FArray = cast<ConstantArray>(Filter);
+ bool AllFound = true;
+ for (unsigned f = 0; f != FElts; ++f) {
+ Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
+ AllFound = false;
+ for (unsigned l = 0; l != LElts; ++l) {
+ Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
+ if (LTypeInfo == FTypeInfo) {
+ AllFound = true;
+ break;
+ }
+ }
+ if (!AllFound)
+ break;
+ }
+ if (AllFound) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ }
+ }
+
+ // If we changed any of the clauses, replace the old landingpad instruction
+ // with a new one.
+ if (MakeNewInstruction) {
+ LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
+ NewClauses.size());
+ for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
+ NLI->addClause(NewClauses[i]);
+ // A landing pad with no clauses must have the cleanup flag set. It is
+ // theoretically possible, though highly unlikely, that we eliminated all
+ // clauses. If so, force the cleanup flag to true.
+ if (NewClauses.empty())
+ CleanupFlag = true;
+ NLI->setCleanup(CleanupFlag);
+ return NLI;
+ }
+
+ // Even if none of the clauses changed, we may nonetheless have understood
+ // that the cleanup flag is pointless. Clear it if so.
+ if (LI.isCleanup() != CleanupFlag) {
+ assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
+ LI.setCleanup(CleanupFlag);
+ return &LI;
+ }
+
+ return nullptr;
+}
+
+/// Try to move the specified instruction from its current block into the
+/// beginning of DestBlock, which can only happen if it's safe to move the
+/// instruction past all of the instructions between it and the end of its
+/// block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+ assert(I->hasOneUse() && "Invariants didn't hold!");
+
+  // Cannot move control-flow-involving instructions, volatile loads, vaarg,
+  // etc.
+ if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
+ isa<TerminatorInst>(I))
+ return false;
+
+ // Do not sink alloca instructions out of the entry block.
+ if (isa<AllocaInst>(I) && I->getParent() ==
+ &DestBlock->getParent()->getEntryBlock())
+ return false;
+
+ // Do not sink into catchswitch blocks.
+ if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
+ return false;
+
+ // Do not sink convergent call instructions.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ if (CI->isConvergent())
+ return false;
+ }
+  // We can only sink load instructions if there is nothing between the load
+  // and the end of the block that could change the value.
+ if (I->mayReadFromMemory()) {
+ for (BasicBlock::iterator Scan = I->getIterator(),
+ E = I->getParent()->end();
+ Scan != E; ++Scan)
+ if (Scan->mayWriteToMemory())
+ return false;
+ }
+
+ BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
+ I->moveBefore(&*InsertPos);
+ ++NumSunkInst;
+ return true;
+}
+
+bool InstCombiner::run() {
+ while (!Worklist.isEmpty()) {
+ Instruction *I = Worklist.RemoveOne();
+ if (I == nullptr) continue; // skip null values.
+
+ // Check to see if we can DCE the instruction.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ MadeIRChange = true;
+ continue;
+ }
+
+ // Instruction isn't dead, see if we can constant propagate it.
+ if (!I->use_empty() &&
+ (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
+ if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
+ DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+
+ // Add operands to the worklist.
+ replaceInstUsesWith(*I, C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(I, &TLI))
+ eraseInstFromFunction(*I);
+ MadeIRChange = true;
+ continue;
+ }
+ }
+
+ // In general, it is possible for computeKnownBits to determine all bits in
+ // a value even when the operands are not all constants.
+ Type *Ty = I->getType();
+ if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) {
+ KnownBits Known = computeKnownBits(I, /*Depth*/0, I);
+ if (Known.isConstant()) {
+ Constant *C = ConstantInt::get(Ty, Known.getConstant());
+ DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
+ " from: " << *I << '\n');
+
+ // Add operands to the worklist.
+ replaceInstUsesWith(*I, C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(I, &TLI))
+ eraseInstFromFunction(*I);
+ MadeIRChange = true;
+ continue;
+ }
+ }
+
+ // See if we can trivially sink this instruction to a successor basic block.
+ if (I->hasOneUse()) {
+ BasicBlock *BB = I->getParent();
+ Instruction *UserInst = cast<Instruction>(*I->user_begin());
+ BasicBlock *UserParent;
+
+ // Get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserParent = PN->getIncomingBlock(*I->use_begin());
+ else
+ UserParent = UserInst->getParent();
+
+ if (UserParent != BB) {
+ bool UserIsSuccessor = false;
+ // See if the user is one of our successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+ if (*SI == UserParent) {
+ UserIsSuccessor = true;
+ break;
+ }
+
+        // If the user is one of our immediate successors, and if that
+        // successor only has us as a predecessor (we'd have to split the
+        // critical edge otherwise), we can keep going.
+ if (UserIsSuccessor && UserParent->getUniquePredecessor()) {
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ if (TryToSinkInstruction(I, UserParent)) {
+ DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ MadeIRChange = true;
+            // We'll add uses of the sunk instruction below, but since
+            // sinking can expose opportunities for its *operands*, add them
+            // to the worklist.
+ for (Use &U : I->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+ Worklist.Add(OpI);
+ }
+ }
+ }
+ }
+
+ // Now that we have an instruction, try combining it to simplify it.
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+#ifndef NDEBUG
+ std::string OrigI;
+#endif
+ DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+ DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+
+ if (Instruction *Result = visit(*I)) {
+ ++NumCombined;
+ // Should we replace the old instruction with a new one?
+ if (Result != I) {
+ DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+ << " New = " << *Result << '\n');
+
+ if (I->getDebugLoc())
+ Result->setDebugLoc(I->getDebugLoc());
+ // Everything uses the new instruction now.
+ I->replaceAllUsesWith(Result);
+
+ // Move the name to the new instruction first.
+ Result->takeName(I);
+
+ // Push the new instruction and any users onto the worklist.
+ Worklist.AddUsersToWorkList(*Result);
+ Worklist.Add(Result);
+
+ // Insert the new instruction into the basic block...
+ BasicBlock *InstParent = I->getParent();
+ BasicBlock::iterator InsertPos = I->getIterator();
+
+ // If we replace a PHI with something that isn't a PHI, fix up the
+ // insertion point.
+ if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
+ InsertPos = InstParent->getFirstInsertionPt();
+
+ InstParent->getInstList().insert(InsertPos, Result);
+
+ eraseInstFromFunction(*I);
+ } else {
+ DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+ << " New = " << *I << '\n');
+
+        // If the instruction was modified, it's possible that it is now
+        // dead. If so, remove it.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ } else {
+ Worklist.AddUsersToWorkList(*I);
+ Worklist.Add(I);
+ }
+ }
+ MadeIRChange = true;
+ }
+ }
+
+ Worklist.Zap();
+ return MadeIRChange;
+}
+
+/// Walk the function in depth-first order, adding all reachable code to the
+/// worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful. In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant). Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+///
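+/// For example (illustrative), given "br i1 true, label %then, label %else",
+/// only %then is pushed onto the block worklist; instructions in blocks that
+/// are never visited are removed later by prepareICWorklistFromFunction.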
+static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
+ SmallPtrSetImpl<BasicBlock *> &Visited,
+ InstCombineWorklist &ICWorklist,
+ const TargetLibraryInfo *TLI) {
+ bool MadeIRChange = false;
+ SmallVector<BasicBlock*, 256> Worklist;
+ Worklist.push_back(BB);
+
+ SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
+ DenseMap<Constant *, Constant *> FoldedConstants;
+
+ do {
+ BB = Worklist.pop_back_val();
+
+ // We have now visited this block! If we've already been here, ignore it.
+ if (!Visited.insert(BB).second)
+ continue;
+
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *Inst = &*BBI++;
+
+ // DCE instruction if trivially dead.
+ if (isInstructionTriviallyDead(Inst, TLI)) {
+ ++NumDeadInst;
+ DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ // ConstantProp instruction if trivially constant.
+ if (!Inst->use_empty() &&
+ (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
+ if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
+ DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
+ << *Inst << '\n');
+ Inst->replaceAllUsesWith(C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(Inst, TLI))
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ // See if we can constant fold its operands.
+ for (Use &U : Inst->operands()) {
+ if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
+ continue;
+
+ auto *C = cast<Constant>(U);
+ Constant *&FoldRes = FoldedConstants[C];
+ if (!FoldRes)
+ FoldRes = ConstantFoldConstant(C, DL, TLI);
+ if (!FoldRes)
+ FoldRes = C;
+
+ if (FoldRes != C) {
+ DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
+ U = FoldRes;
+ MadeIRChange = true;
+ }
+ }
+
+      // Skip processing debug intrinsics in InstCombine. Processing these
+      // call instructions consumes a non-trivial amount of time and provides
+      // no value for the optimization.
+ if (!isa<DbgInfoIntrinsic>(Inst))
+ InstrsForInstCombineWorklist.push_back(Inst);
+ }
+
+ // Recursively visit successors. If this is a branch or switch on a
+ // constant, only visit the reachable successor.
+ TerminatorInst *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+ bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+ BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+ Worklist.push_back(ReachableBB);
+ continue;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+ Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
+ continue;
+ }
+ }
+
+ for (BasicBlock *SuccBB : TI->successors())
+ Worklist.push_back(SuccBB);
+ } while (!Worklist.empty());
+
+ // Once we've found all of the instructions to add to instcombine's worklist,
+ // add them in reverse order. This way instcombine will visit from the top
+  // of the function down. This jibes well with the way that it adds all uses
+ // of instructions to the worklist after doing a transformation, thus avoiding
+ // some N^2 behavior in pathological cases.
+ ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist);
+
+ return MadeIRChange;
+}
+
+/// \brief Populate the IC worklist from a function, and prune any dead basic
+/// blocks discovered in the process.
+///
+/// This also does basic constant propagation and other forward fixing to make
+/// the combiner itself run much faster.
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
+ TargetLibraryInfo *TLI,
+ InstCombineWorklist &ICWorklist) {
+ bool MadeIRChange = false;
+
+  // Do a depth-first traversal of the function, populating the worklist with
+  // the reachable instructions. Ignore blocks that are not reachable. Keep
+  // track of which blocks we visit.
+ SmallPtrSet<BasicBlock *, 32> Visited;
+ MadeIRChange |=
+ AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI);
+
+ // Do a quick scan over the function. If we find any blocks that are
+ // unreachable, remove any instructions inside of them. This prevents
+ // the instcombine code from having to deal with some bad special cases.
+ for (BasicBlock &BB : F) {
+ if (Visited.count(&BB))
+ continue;
+
+ unsigned NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
+ MadeIRChange |= NumDeadInstInBB > 0;
+ NumDeadInst += NumDeadInstInBB;
+ }
+
+ return MadeIRChange;
+}
+
+static bool
+combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
+ AliasAnalysis *AA, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, DominatorTree &DT,
+ bool ExpensiveCombines = true,
+ LoopInfo *LI = nullptr) {
+ auto &DL = F.getParent()->getDataLayout();
+ ExpensiveCombines |= EnableExpensiveCombines;
+
+ /// Builder - This is an IRBuilder that automatically inserts new
+ /// instructions into the worklist when they are created.
+ IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
+ F.getContext(), TargetFolder(DL),
+ IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
+ Worklist.Add(I);
+
+ using namespace llvm::PatternMatch;
+ if (match(I, m_Intrinsic<Intrinsic::assume>()))
+ AC.registerAssumption(cast<CallInst>(I));
+ }));
+
+  // Lower dbg.declare intrinsics, otherwise their value may be clobbered by
+  // the instcombiner.
+ bool MadeIRChange = LowerDbgDeclare(F);
+
+ // Iterate while there is work to do.
+ int Iteration = 0;
+ for (;;) {
+ ++Iteration;
+ DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getName() << "\n");
+
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+
+ InstCombiner IC(Worklist, Builder, F.optForMinSize(), ExpensiveCombines,
+ AA, AC, TLI, DT, DL, LI);
+ IC.MaxArraySizeForCombine = MaxArraySize;
+
+ if (!IC.run())
+ break;
+ }
+
+ return MadeIRChange || Iteration > 1;
+}
+
+PreservedAnalyses InstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+
+ // FIXME: The AliasAnalysis is not yet supported in the new pass manager
+ if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT,
+ ExpensiveCombines, LI))
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool InstructionCombiningPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Required analyses.
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ // Optional analyses.
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+
+ return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT,
+ ExpensiveCombines, LI);
+}
+
+char InstructionCombiningPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+
+// Initialization Routines
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+ initializeInstructionCombiningPassPass(Registry);
+}
+
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+ initializeInstructionCombiningPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) {
+ return new InstructionCombiningPass(ExpensiveCombines);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
new file mode 100644
index 000000000000..184940b7ea58
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -0,0 +1,2973 @@
+//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+// Details of the algorithm:
+// http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <system_error>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asan"
+
+static const uint64_t kDefaultShadowScale = 3;
+static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
+static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDynamicShadowSentinel = ~(uint64_t)0;
+static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
+static const uint64_t kIOSSimShadowOffset32 = 1ULL << 30;
+static const uint64_t kIOSSimShadowOffset64 = kDefaultShadowOffset64;
+static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G.
+static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
+static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
+static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
+static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
+static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
+static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
+static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
+static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+// The shadow memory space is dynamically allocated.
+static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
+
+static const size_t kMinStackMallocSize = 1 << 6; // 64B
+static const size_t kMaxStackMallocSize = 1 << 16; // 64K
+static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
+static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
+
+static const char *const kAsanModuleCtorName = "asan.module_ctor";
+static const char *const kAsanModuleDtorName = "asan.module_dtor";
+static const uint64_t kAsanCtorAndDtorPriority = 1;
+static const char *const kAsanReportErrorTemplate = "__asan_report_";
+static const char *const kAsanRegisterGlobalsName = "__asan_register_globals";
+static const char *const kAsanUnregisterGlobalsName =
+ "__asan_unregister_globals";
+static const char *const kAsanRegisterImageGlobalsName =
+ "__asan_register_image_globals";
+static const char *const kAsanUnregisterImageGlobalsName =
+ "__asan_unregister_image_globals";
+static const char *const kAsanRegisterElfGlobalsName =
+ "__asan_register_elf_globals";
+static const char *const kAsanUnregisterElfGlobalsName =
+ "__asan_unregister_elf_globals";
+static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
+static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
+static const char *const kAsanInitName = "__asan_init";
+static const char *const kAsanVersionCheckName =
+ "__asan_version_mismatch_check_v8";
+static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
+static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
+static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
+static const int kMaxAsanStackMallocSizeClass = 10;
+static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
+static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
+static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kODRGenPrefix = "__odr_asan_gen_";
+static const char *const kSanCovGenPrefix = "__sancov_gen_";
+static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_";
+static const char *const kAsanPoisonStackMemoryName =
+ "__asan_poison_stack_memory";
+static const char *const kAsanUnpoisonStackMemoryName =
+ "__asan_unpoison_stack_memory";
+
+// ASan version script has __asan_* wildcard. Triple underscore prevents a
+// linker (gold) warning about attempting to export a local symbol.
+static const char *const kAsanGlobalsRegisteredFlagName =
+ "___asan_globals_registered";
+
+static const char *const kAsanOptionDetectUseAfterReturn =
+ "__asan_option_detect_stack_use_after_return";
+
+static const char *const kAsanShadowMemoryDynamicAddress =
+ "__asan_shadow_memory_dynamic_address";
+
+static const char *const kAsanAllocaPoison = "__asan_alloca_poison";
+static const char *const kAsanAllocasUnpoison = "__asan_allocas_unpoison";
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const unsigned kAllocaRzSize = 32;
+
+// Command-line flags.
+static cl::opt<bool> ClEnableKasan(
+ "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+static cl::opt<bool> ClRecover(
+ "asan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
+// This flag may need to be replaced with -f[no-]asan-reads.
+static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentWrites(
+ "asan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentAtomics(
+ "asan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+static cl::opt<bool> ClAlwaysSlowPath(
+ "asan-always-slow-path",
+ cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> ClForceDynamicShadow(
+ "asan-force-dynamic-shadow",
+ cl::desc("Load shadow address into a local variable for each function"),
+ cl::Hidden, cl::init(false));
+
+// This flag limits the number of instructions to be instrumented
+// in any given BB. Normally, this should be set to unlimited (INT_MAX),
+// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
+// set it to 10000.
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
+ "asan-max-ins-per-bb", cl::init(10000),
+ cl::desc("maximal number of instructions to instrument in any given BB"),
+ cl::Hidden);
+// This flag may need to be replaced with -f[no]asan-stack.
+static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
+ cl::Hidden, cl::init(true));
+static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
+ "asan-max-inline-poisoning-size",
+ cl::desc(
+ "Inline shadow poisoning for blocks up to the given size in bytes."),
+ cl::Hidden, cl::init(64));
+static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
+ cl::desc("Check stack-use-after-return"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
+ cl::desc("Check stack-use-after-scope"),
+ cl::Hidden, cl::init(false));
+// This flag may need to be replaced with -f[no]asan-globals.
+static cl::opt<bool> ClGlobals("asan-globals",
+ cl::desc("Handle global objects"), cl::Hidden,
+ cl::init(true));
+static cl::opt<bool> ClInitializers("asan-initialization-order",
+ cl::desc("Handle C++ initializer order"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInvalidPointerPairs(
+ "asan-detect-invalid-pointer-pair",
+ cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
+ cl::init(false));
+static cl::opt<unsigned> ClRealignStack(
+ "asan-realign-stack",
+ cl::desc("Realign stack to the value of this flag (power of two)"),
+ cl::Hidden, cl::init(32));
+static cl::opt<int> ClInstrumentationWithCallsThreshold(
+ "asan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented contains more than "
+ "this number of memory accesses, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(7000));
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+ "asan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__asan_"));
+static cl::opt<bool>
+ ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
+ cl::desc("instrument dynamic allocas"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClSkipPromotableAllocas(
+ "asan-skip-promotable-allocas",
+ cl::desc("Do not instrument promotable allocas"), cl::Hidden,
+ cl::init(true));
+
+// These flags allow changing the shadow mapping.
+// The shadow mapping looks like
+// Shadow = (Mem >> scale) + offset
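+// For illustration (example values only, not a guarantee): with the default
+// 64-bit Linux/x86_64 mapping, scale = 3 and offset = 0x7FFF8000, so an
+// access at Mem = 0x7f0000000000 is checked against the shadow byte at
+// (0x7f0000000000 >> 3) + 0x7FFF8000 = 0xfe07fff8000.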
+static cl::opt<int> ClMappingScale("asan-mapping-scale",
+ cl::desc("scale of asan shadow mapping"),
+ cl::Hidden, cl::init(0));
+static cl::opt<unsigned long long> ClMappingOffset(
+ "asan-mapping-offset",
+ cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"), cl::Hidden,
+ cl::init(0));
+
+// Optimization flags. Not user visible, used mostly for testing
+// and benchmarking the tool.
+static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptSameTemp(
+ "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptGlobals("asan-opt-globals",
+ cl::desc("Don't instrument scalar globals"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptStack(
+ "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDynamicAllocaStack(
+ "asan-stack-dynamic-alloca",
+ cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<uint32_t> ClForceExperiment(
+ "asan-force-experiment",
+ cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<bool>
+ ClUsePrivateAliasForGlobals("asan-use-private-alias",
+ cl::desc("Use private aliases for global"
+ " variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseGlobalsGC("asan-globals-live-support",
+ cl::desc("Use linker features to support dead "
+ "code stripping of globals"),
+ cl::Hidden, cl::init(true));
+
+// This is on by default even though there is a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool>
+ ClWithComdat("asan-with-comdat",
+ cl::desc("Place ASan constructors in comdat sections"),
+ cl::Hidden, cl::init(true));
+
+// Debug flags.
+static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
+ cl::init(0));
+static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
+ cl::Hidden, cl::init(0));
+static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
+ cl::desc("Debug func"));
+static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
+ cl::Hidden, cl::init(-1));
+static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
+ cl::Hidden, cl::init(-1));
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+ "Number of optimized accesses to global vars");
+STATISTIC(NumOptimizedAccessesToStackVar,
+ "Number of optimized accesses to stack vars");
+
+namespace {
+/// Frontend-provided metadata for source location.
+struct LocationMetadata {
+ StringRef Filename;
+ int LineNo;
+ int ColumnNo;
+
+ LocationMetadata() : Filename(), LineNo(0), ColumnNo(0) {}
+
+ bool empty() const { return Filename.empty(); }
+
+ void parse(MDNode *MDN) {
+ assert(MDN->getNumOperands() == 3);
+ MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
+ Filename = DIFilename->getString();
+ LineNo =
+ mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
+ ColumnNo =
+ mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
+ }
+};
+
+/// Frontend-provided metadata for global variables.
+class GlobalsMetadata {
+ public:
+ struct Entry {
+ Entry() : SourceLoc(), Name(), IsDynInit(false), IsBlacklisted(false) {}
+ LocationMetadata SourceLoc;
+ StringRef Name;
+ bool IsDynInit;
+ bool IsBlacklisted;
+ };
+
+ GlobalsMetadata() : inited_(false) {}
+
+ void reset() {
+ inited_ = false;
+ Entries.clear();
+ }
+
+ void init(Module &M) {
+ assert(!inited_);
+ inited_ = true;
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
+ if (!Globals) return;
+ for (auto MDN : Globals->operands()) {
+ // Metadata node contains the global and the fields of "Entry".
+ assert(MDN->getNumOperands() == 5);
+ auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0));
+ // The optimizer may optimize away a global entirely.
+ if (!GV) continue;
+ // We can already have an entry for GV if it was merged with another
+ // global.
+ Entry &E = Entries[GV];
+ if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
+ E.SourceLoc.parse(Loc);
+ if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
+ E.Name = Name->getString();
+ ConstantInt *IsDynInit =
+ mdconst::extract<ConstantInt>(MDN->getOperand(3));
+ E.IsDynInit |= IsDynInit->isOne();
+ ConstantInt *IsBlacklisted =
+ mdconst::extract<ConstantInt>(MDN->getOperand(4));
+ E.IsBlacklisted |= IsBlacklisted->isOne();
+ }
+ }
+
+ /// Returns metadata entry for a given global.
+ Entry get(GlobalVariable *G) const {
+ auto Pos = Entries.find(G);
+ return (Pos != Entries.end()) ? Pos->second : Entry();
+ }
+
+ private:
+ bool inited_;
+ DenseMap<GlobalVariable *, Entry> Entries;
+};
+
+/// This struct defines the shadow mapping using the rule:
+/// shadow = (mem >> Scale) ADD-or-OR Offset.
+struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool OrShadowOffset;
+};
+
+static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
+ bool IsKasan) {
+ bool IsAndroid = TargetTriple.isAndroid();
+ bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
+ bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+ bool IsPS4CPU = TargetTriple.isPS4CPU();
+ bool IsLinux = TargetTriple.isOSLinux();
+ bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
+ TargetTriple.getArch() == llvm::Triple::ppc64le;
+ bool IsSystemZ = TargetTriple.getArch() == llvm::Triple::systemz;
+ bool IsX86 = TargetTriple.getArch() == llvm::Triple::x86;
+ bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+ bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips ||
+ TargetTriple.getArch() == llvm::Triple::mipsel;
+ bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
+ TargetTriple.getArch() == llvm::Triple::mips64el;
+ bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
+ bool IsWindows = TargetTriple.isOSWindows();
+ bool IsFuchsia = TargetTriple.isOSFuchsia();
+
+ ShadowMapping Mapping;
+
+ if (LongSize == 32) {
+ // Android is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsAndroid)
+ Mapping.Offset = 0;
+ else if (IsMIPS32)
+ Mapping.Offset = kMIPS32_ShadowOffset32;
+ else if (IsFreeBSD)
+ Mapping.Offset = kFreeBSD_ShadowOffset32;
+ else if (IsIOS)
+ // If we're targeting iOS and x86, the binary is built for iOS simulator.
+ Mapping.Offset = IsX86 ? kIOSSimShadowOffset32 : kIOSShadowOffset32;
+ else if (IsWindows)
+ Mapping.Offset = kWindowsShadowOffset32;
+ else
+ Mapping.Offset = kDefaultShadowOffset32;
+ } else { // LongSize == 64
+ // Fuchsia is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsFuchsia)
+ Mapping.Offset = 0;
+ else if (IsPPC64)
+ Mapping.Offset = kPPC64_ShadowOffset64;
+ else if (IsSystemZ)
+ Mapping.Offset = kSystemZ_ShadowOffset64;
+ else if (IsFreeBSD)
+ Mapping.Offset = kFreeBSD_ShadowOffset64;
+ else if (IsPS4CPU)
+ Mapping.Offset = kPS4CPU_ShadowOffset64;
+ else if (IsLinux && IsX86_64) {
+ if (IsKasan)
+ Mapping.Offset = kLinuxKasan_ShadowOffset64;
+ else
+ Mapping.Offset = kSmallX86_64ShadowOffset;
+ } else if (IsWindows && IsX86_64) {
+ Mapping.Offset = kWindowsShadowOffset64;
+ } else if (IsMIPS64)
+ Mapping.Offset = kMIPS64_ShadowOffset64;
+ else if (IsIOS)
+ // If we're targeting iOS and x86, the binary is built for iOS simulator.
+ // We are using dynamic shadow offset on the 64-bit devices.
+ Mapping.Offset =
+ IsX86_64 ? kIOSSimShadowOffset64 : kDynamicShadowSentinel;
+ else if (IsAArch64)
+ Mapping.Offset = kAArch64_ShadowOffset64;
+ else
+ Mapping.Offset = kDefaultShadowOffset64;
+ }
+
+ if (ClForceDynamicShadow) {
+ Mapping.Offset = kDynamicShadowSentinel;
+ }
+
+ Mapping.Scale = kDefaultShadowScale;
+ if (ClMappingScale.getNumOccurrences() > 0) {
+ Mapping.Scale = ClMappingScale;
+ }
+
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ Mapping.Offset = ClMappingOffset;
+ }
+
+ // OR-ing the shadow offset is more efficient (at least on x86) if the offset
+ // is a power of two, but on ppc64 we have to use add since the shadow
+ // offset is not necessarily 1/8-th of the address space. On SystemZ,
+ // we could OR the constant in a single instruction, but it's more
+ // efficient to load it once and use indexed addressing.
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+ !(Mapping.Offset & (Mapping.Offset - 1)) &&
+ Mapping.Offset != kDynamicShadowSentinel;
+
+ return Mapping;
+}
+
+static size_t RedzoneSizeForScale(int MappingScale) {
+ // Redzone used for stack and globals is at least 32 bytes.
+ // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
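+ // (With the default scale of 3, 1 << 3 == 8, so the 32-byte minimum applies.)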
+ return std::max(32U, 1U << MappingScale);
+}
+
+/// AddressSanitizer: instrument the code in a module to find memory bugs.
+struct AddressSanitizer : public FunctionPass {
+ explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
+ bool UseAfterScope = false)
+ : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan),
+ Recover(Recover || ClRecover),
+ UseAfterScope(UseAfterScope || ClUseAfterScope),
+ LocalDynamicShadow(nullptr) {
+ initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override {
+ return "AddressSanitizerFunctionPass";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+ uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes =
+ AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+ }
+ /// Check if we want (and can) handle this alloca.
+ bool isInterestingAlloca(const AllocaInst &AI);
+
+ /// If it is an interesting memory access, return the PointerOperand
+ /// and set IsWrite/Alignment. Otherwise return nullptr.
+ /// MaybeMask is an output parameter for the mask Value, if we're looking at a
+ /// masked load/store.
+ Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
+ uint64_t *TypeSize, unsigned *Alignment,
+ Value **MaybeMask = nullptr);
+ void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I,
+ bool UseCalls, const DataLayout &DL);
+ void instrumentPointerComparisonOrSubtraction(Instruction *I);
+ void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+ Value *Addr, uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls, uint32_t Exp);
+ void instrumentUnusualSizeOrAlignment(Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp);
+ Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue, uint32_t TypeSize);
+ Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
+ bool IsWrite, size_t AccessSizeIndex,
+ Value *SizeArgument, uint32_t Exp);
+ void instrumentMemIntrinsic(MemIntrinsic *MI);
+ Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool runOnFunction(Function &F) override;
+ bool maybeInsertAsanInitAtFunctionEntry(Function &F);
+ void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+ void markEscapedLocalAllocas(Function &F);
+ bool doInitialization(Module &M) override;
+ bool doFinalization(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid
+
+ DominatorTree &getDominatorTree() const { return *DT; }
+
+ private:
+ void initializeCallbacks(Module &M);
+
+ bool LooksLikeCodeInBug11395(Instruction *I);
+ bool GlobalIsLinkerInitialized(GlobalVariable *G);
+ bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
+ uint64_t TypeSize) const;
+
+ /// Helper to clean up per-function state.
+ struct FunctionStateRAII {
+ AddressSanitizer *Pass;
+ FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
+ assert(Pass->ProcessedAllocas.empty() &&
+ "last pass forgot to clear cache");
+ assert(!Pass->LocalDynamicShadow);
+ }
+ ~FunctionStateRAII() {
+ Pass->LocalDynamicShadow = nullptr;
+ Pass->ProcessedAllocas.clear();
+ }
+ };
+
+ LLVMContext *C;
+ Triple TargetTriple;
+ int LongSize;
+ bool CompileKernel;
+ bool Recover;
+ bool UseAfterScope;
+ Type *IntptrTy;
+ ShadowMapping Mapping;
+ DominatorTree *DT;
+ Function *AsanHandleNoReturnFunc;
+ Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
+ // This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
+ Function *AsanErrorCallback[2][2][kNumberOfAccessSizes];
+ Function *AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
+ // This array is indexed by AccessIsWrite and Experiment.
+ Function *AsanErrorCallbackSized[2][2];
+ Function *AsanMemoryAccessCallbackSized[2][2];
+ Function *AsanMemmove, *AsanMemcpy, *AsanMemset;
+ InlineAsm *EmptyAsm;
+ Value *LocalDynamicShadow;
+ GlobalsMetadata GlobalsMD;
+ DenseMap<const AllocaInst *, bool> ProcessedAllocas;
+
+ friend struct FunctionStackPoisoner;
+};
+
+class AddressSanitizerModule : public ModulePass {
+public:
+ explicit AddressSanitizerModule(bool CompileKernel = false,
+ bool Recover = false,
+ bool UseGlobalsGC = true)
+ : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
+ Recover(Recover || ClRecover),
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+ // and both suffer from gold PR19002 for which UseGlobalsGC constructor
+ // argument is designed as workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
+ bool runOnModule(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid
+ StringRef getPassName() const override { return "AddressSanitizerModule"; }
+
+private:
+ void initializeCallbacks(Module &M);
+
+ bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
+ void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId);
+ void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void
+ InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+
+ GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName);
+ void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
+ StringRef InternalSuffix);
+ IRBuilder<> CreateAsanModuleDtor(Module &M);
+
+ bool ShouldInstrumentGlobal(GlobalVariable *G);
+ bool ShouldUseMachOGlobalsSection() const;
+ StringRef getGlobalMetadataSection() const;
+ void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
+ void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
+ size_t MinRedzoneSizeForGlobal() const {
+ return RedzoneSizeForScale(Mapping.Scale);
+ }
+
+ GlobalsMetadata GlobalsMD;
+ bool CompileKernel;
+ bool Recover;
+ bool UseGlobalsGC;
+ bool UseCtorComdat;
+ Type *IntptrTy;
+ LLVMContext *C;
+ Triple TargetTriple;
+ ShadowMapping Mapping;
+ Function *AsanPoisonGlobals;
+ Function *AsanUnpoisonGlobals;
+ Function *AsanRegisterGlobals;
+ Function *AsanUnregisterGlobals;
+ Function *AsanRegisterImageGlobals;
+ Function *AsanUnregisterImageGlobals;
+ Function *AsanRegisterElfGlobals;
+ Function *AsanUnregisterElfGlobals;
+
+ Function *AsanCtorFunction = nullptr;
+ Function *AsanDtorFunction = nullptr;
+};
+
+// Stack poisoning does not play well with exception handling.
+// When an exception is thrown, we essentially bypass the code
+// that unpoisons the stack. This is why the run-time library has
+// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
+// stack in the interceptor. This, however, does not work inside the
+// actual function which catches the exception, most likely because the
+// compiler hoists the load of the shadow value somewhere too high.
+// This causes asan to report a non-existing bug on 453.povray.
+// It sounds like an LLVM bug.
+struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
+ Function &F;
+ AddressSanitizer &ASan;
+ DIBuilder DIB;
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *IntptrPtrTy;
+ ShadowMapping Mapping;
+
+ SmallVector<AllocaInst *, 16> AllocaVec;
+ SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
+ SmallVector<Instruction *, 8> RetVec;
+ unsigned StackAlignment;
+
+ Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+ *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
+ Function *AsanSetShadowFunc[0x100] = {};
+ Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
+ Function *AsanAllocaPoisonFunc, *AsanAllocasUnpoisonFunc;
+
+ // Stores the location and arguments of a poisoning/unpoisoning call for an alloca.
+ struct AllocaPoisonCall {
+ IntrinsicInst *InsBefore;
+ AllocaInst *AI;
+ uint64_t Size;
+ bool DoPoison;
+ };
+ SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
+ SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
+
+ SmallVector<AllocaInst *, 1> DynamicAllocaVec;
+ SmallVector<IntrinsicInst *, 1> StackRestoreVec;
+ AllocaInst *DynamicAllocaLayout = nullptr;
+ IntrinsicInst *LocalEscapeCall = nullptr;
+
+ // Maps a Value to the AllocaInst from which the Value originated.
+ typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy;
+ AllocaForValueMapTy AllocaForValue;
+
+ bool HasNonEmptyInlineAsm = false;
+ bool HasReturnsTwiceCall = false;
+ std::unique_ptr<CallInst> EmptyInlineAsm;
+
+ FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
+ : F(F),
+ ASan(ASan),
+ DIB(*F.getParent(), /*AllowUnresolved*/ false),
+ C(ASan.C),
+ IntptrTy(ASan.IntptrTy),
+ IntptrPtrTy(PointerType::get(IntptrTy, 0)),
+ Mapping(ASan.Mapping),
+ StackAlignment(1 << Mapping.Scale),
+ EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}
+
+ bool runOnFunction() {
+ if (!ClStack) return false;
+ // Collect alloca, ret, lifetime instructions etc.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
+
+ if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
+
+ initializeCallbacks(*F.getParent());
+
+ processDynamicAllocas();
+ processStaticAllocas();
+
+ if (ClDebugStack) {
+ DEBUG(dbgs() << F);
+ }
+ return true;
+ }
+
+ // Finds all Alloca instructions and puts
+ // poisoned red zones around all of them.
+ // Then unpoisons everything back before the function returns.
+ void processStaticAllocas();
+ void processDynamicAllocas();
+
+ void createDynamicAllocasInitStorage();
+
+ // ----------------------- Visitors.
+ /// \brief Collect all Ret instructions.
+ void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }
+
+ /// \brief Collect all Resume instructions.
+ void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
+
+ /// \brief Collect all CleanupReturnInst instructions.
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
+
+ void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
+ Value *SavedStack) {
+ IRBuilder<> IRB(InstBefore);
+ Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
+ // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
+ // need to adjust the extracted SP to compute the address of the most recent
+ // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
+ // this purpose.
+ if (!isa<ReturnInst>(InstBefore)) {
+ Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
+ InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
+ {IntptrTy});
+
+ Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
+
+ DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
+ DynamicAreaOffset);
+ }
+
+ IRB.CreateCall(AsanAllocasUnpoisonFunc,
+ {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr});
+ }
+
+ // Unpoison dynamic allocas redzones.
+ void unpoisonDynamicAllocas() {
+ for (auto &Ret : RetVec)
+ unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);
+
+ for (auto &StackRestoreInst : StackRestoreVec)
+ unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
+ StackRestoreInst->getOperand(0));
+ }
+
+ // Deploy and poison redzones around a dynamic alloca call. To do this, we
+ // replace this call with another one with changed parameters and
+ // replace all its uses with the new address, so
+ // addr = alloca type, old_size, align
+ // is replaced by
+ // new_size = (old_size + additional_size) * sizeof(type)
+ // tmp = alloca i8, new_size, max(align, 32)
+ // addr = tmp + 32 (first 32 bytes are for the left redzone).
+ // Additional_size is added to make the new memory allocation contain not
+ // only the requested memory, but also the left, partial and right redzones.
+ void handleDynamicAllocaCall(AllocaInst *AI);
+
+ /// \brief Collect Alloca instructions we want (and can) handle.
+ void visitAllocaInst(AllocaInst &AI) {
+ if (!ASan.isInterestingAlloca(AI)) {
+ if (AI.isStaticAlloca()) {
+ // Skip over allocas that are present *before* the first instrumented
+ // alloca; we don't want to move those around.
+ if (AllocaVec.empty())
+ return;
+
+ StaticAllocasToMoveUp.push_back(&AI);
+ }
+ return;
+ }
+
+ StackAlignment = std::max(StackAlignment, AI.getAlignment());
+ if (!AI.isStaticAlloca())
+ DynamicAllocaVec.push_back(&AI);
+ else
+ AllocaVec.push_back(&AI);
+ }
+
+ /// \brief Collect lifetime intrinsic calls to check for use-after-scope
+ /// errors.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ Intrinsic::ID ID = II.getIntrinsicID();
+ if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
+ if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
+ if (!ASan.UseAfterScope)
+ return;
+ if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)
+ return;
+ // Found lifetime intrinsic, add ASan instrumentation if necessary.
+ ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
+ // If size argument is undefined, don't do anything.
+ if (Size->isMinusOne()) return;
+ // Check that size doesn't saturate uint64_t and can
+ // be stored in IntptrTy.
+ const uint64_t SizeValue = Size->getValue().getLimitedValue();
+ if (SizeValue == ~0ULL ||
+ !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
+ return;
+ // Find alloca instruction that corresponds to llvm.lifetime argument.
+ AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
+ if (!AI || !ASan.isInterestingAlloca(*AI))
+ return;
+ bool DoPoison = (ID == Intrinsic::lifetime_end);
+ AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
+ if (AI->isStaticAlloca())
+ StaticAllocaPoisonCallVec.push_back(APC);
+ else if (ClInstrumentDynamicAllocas)
+ DynamicAllocaPoisonCallVec.push_back(APC);
+ }
+
+ void visitCallSite(CallSite CS) {
+ Instruction *I = CS.getInstruction();
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ HasNonEmptyInlineAsm |=
+ CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get());
+ HasReturnsTwiceCall |= CI->canReturnTwice();
+ }
+ }
+
+ // ---------------------- Helpers.
+ void initializeCallbacks(Module &M);
+
+ bool doesDominateAllExits(const Instruction *I) const {
+ for (auto Ret : RetVec) {
+ if (!ASan.getDominatorTree().dominates(I, Ret)) return false;
+ }
+ return true;
+ }
+
+ /// Finds alloca where the value comes from.
+ AllocaInst *findAllocaForValue(Value *V);
+
+ // Copies bytes from ShadowBytes into shadow memory for indexes where
+ // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
+ // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase);
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End, IRBuilder<> &IRB,
+ Value *ShadowBase);
+ void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes, size_t Begin,
+ size_t End, IRBuilder<> &IRB, Value *ShadowBase);
+
+ void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+ Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
+ bool Dynamic);
+ PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
+ Instruction *ThenTerm, Value *ValueIfFalse);
+};
+
+} // anonymous namespace
+
+char AddressSanitizer::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ AddressSanitizer, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ AddressSanitizer, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
+ bool Recover,
+ bool UseAfterScope) {
+ assert(!CompileKernel || Recover);
+ return new AddressSanitizer(CompileKernel, Recover, UseAfterScope);
+}
+
+char AddressSanitizerModule::ID = 0;
+INITIALIZE_PASS(
+ AddressSanitizerModule, "asan-module",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+ "ModulePass",
+ false, false)
+ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
+ bool Recover,
+ bool UseGlobalsGC) {
+ assert(!CompileKernel || Recover);
+ return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
+}
+
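+// TypeSize is in bits; e.g. a 32-bit access maps to index 2
+// (32 / 8 == 4 == 1 << 2), selecting the size-4 flavor of the callbacks.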
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = countTrailingZeros(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
+/// \brief Create a constant for Str so that we can pass it to the run-time lib.
+static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ // We use private linkage for module-local strings. If they can be merged
+ // with another one, we set the unnamed_addr attribute.
+ GlobalVariable *GV =
+ new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
+ if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
+ return GV;
+}
+
+/// \brief Create a global describing a source location.
+static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
+ LocationMetadata MD) {
+ Constant *LocData[] = {
+ createPrivateGlobalForString(M, MD.Filename, true),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
+ };
+ auto LocStruct = ConstantStruct::getAnon(LocData);
+ auto GV = new GlobalVariable(M, LocStruct->getType(), true,
+ GlobalValue::PrivateLinkage, LocStruct,
+ kAsanGenPrefix);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ return GV;
+}
+
+/// \brief Check if \p G has been created by a trusted compiler pass.
+static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
+ // Do not instrument asan globals.
+ if (G->getName().startswith(kAsanGenPrefix) ||
+ G->getName().startswith(kSanCovGenPrefix) ||
+ G->getName().startswith(kODRGenPrefix))
+ return true;
+
+ // Do not instrument gcov counter arrays.
+ if (G->getName() == "__llvm_gcov_ctr")
+ return true;
+
+ return false;
+}
+
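+// Computes the shadow address for an application address. Roughly, with the
+// default x86_64 mapping this boils down to
+//   shadow = (addr >> 3) + 0x7FFF8000
+// (an OR instead of the add when OrShadowOffset is set, and the dynamically
+// loaded shadow base instead of the constant when LocalDynamicShadow is set);
+// a sketch, not verbatim IR.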
+Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+ // Shadow >> scale
+ Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+ if (Mapping.Offset == 0) return Shadow;
+ // (Shadow >> scale) | offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
+ if (Mapping.OrShadowOffset)
+ return IRB.CreateOr(Shadow, ShadowBase);
+ else
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
+// Instrument memset/memmove/memcpy
+void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ } else if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ AsanMemset,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ }
+ MI->eraseFromParent();
+}
+
+/// Check if we want (and can) handle this alloca.
+bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
+
+ if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
+ return PreviouslySeenAllocaInfo->getSecond();
+
+ bool IsInteresting =
+ (AI.getAllocatedType()->isSized() &&
+ // alloca() may be called with 0 size, ignore it.
+ ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+
+ ProcessedAllocas[&AI] = IsInteresting;
+ return IsInteresting;
+}
+
+Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
+ bool *IsWrite,
+ uint64_t *TypeSize,
+ unsigned *Alignment,
+ Value **MaybeMask) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (I->getMetadata("nosanitize")) return nullptr;
+
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return nullptr;
+
+ Value *PtrOperand = nullptr;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads) return nullptr;
+ *IsWrite = false;
+ *TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
+ *Alignment = LI->getAlignment();
+ PtrOperand = LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites) return nullptr;
+ *IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
+ *Alignment = SI->getAlignment();
+ PtrOperand = SI->getPointerOperand();
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics) return nullptr;
+ *IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
+ *Alignment = 0;
+ PtrOperand = RMW->getPointerOperand();
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics) return nullptr;
+ *IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
+ *Alignment = 0;
+ PtrOperand = XCHG->getPointerOperand();
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ auto *F = dyn_cast<Function>(CI->getCalledValue());
+ if (F && (F->getName().startswith("llvm.masked.load.") ||
+ F->getName().startswith("llvm.masked.store."))) {
+ unsigned OpOffset = 0;
+ if (F->getName().startswith("llvm.masked.store.")) {
+ if (!ClInstrumentWrites)
+ return nullptr;
+ // Masked store has an initial operand for the value.
+ OpOffset = 1;
+ *IsWrite = true;
+ } else {
+ if (!ClInstrumentReads)
+ return nullptr;
+ *IsWrite = false;
+ }
+
+ auto BasePtr = CI->getOperand(0 + OpOffset);
+ auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ *TypeSize = DL.getTypeStoreSizeInBits(Ty);
+ if (auto AlignmentConstant =
+ dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+ *Alignment = (unsigned)AlignmentConstant->getZExtValue();
+ else
+ *Alignment = 1; // No alignment guarantees. We probably got Undef
+ if (MaybeMask)
+ *MaybeMask = CI->getOperand(2 + OpOffset);
+ PtrOperand = BasePtr;
+ }
+ }
+
+ if (PtrOperand) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return nullptr;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (PtrOperand->isSwiftError())
+ return nullptr;
+ }
+
+ // Treat memory accesses to promotable allocas as non-interesting since they
+ // will not cause memory violations. This greatly speeds up the instrumented
+ // executable at -O0.
+ if (ClSkipPromotableAllocas)
+ if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand))
+ return isInterestingAlloca(*AI) ? AI : nullptr;
+
+ return PtrOperand;
+}
+
+static bool isPointerOperand(Value *V) {
+ return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
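+// E.g. with -asan-detect-invalid-pointer-pair, a relational comparison such
+// as p < q or a subtraction p - q of two pointers is reported to the runtime
+// via __sanitizer_ptr_cmp / __sanitizer_ptr_sub.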
+static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
+ if (!Cmp->isRelational()) return false;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() != Instruction::Sub) return false;
+ } else {
+ return false;
+ }
+ return isPointerOperand(I->getOperand(0)) &&
+ isPointerOperand(I->getOperand(1));
+}
+
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+ // If a global variable does not have dynamic initialization we don't
+ // have to instrument it. However, if a global does not have an initializer
+ // at all, we assume it has a dynamic initializer (in another TU).
+ return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
+}
+
+void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
+ Instruction *I) {
+ IRBuilder<> IRB(I);
+ Function *F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
+ Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
+ for (Value *&i : Param) {
+ if (i->getType()->isPointerTy())
+ i = IRB.CreatePointerCast(i, IntptrTy);
+ }
+ IRB.CreateCall(F, Param);
+}
+
+static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ unsigned Alignment, unsigned Granularity,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check
+ // if the data is properly aligned.
+ if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
+ TypeSize == 128) &&
+ (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
+ return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+}
+
+static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
+ const DataLayout &DL, Type *IntptrTy,
+ Value *Mask, Instruction *I,
+ Value *Addr, unsigned Alignment,
+ unsigned Granularity, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument,
+ bool UseCalls, uint32_t Exp) {
+ auto *VTy = cast<PointerType>(Addr->getType())->getElementType();
+ uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+ unsigned Num = VTy->getVectorNumElements();
+ auto Zero = ConstantInt::get(IntptrTy, 0);
+ for (unsigned Idx = 0; Idx < Num; ++Idx) {
+ Value *InstrumentedAddress = nullptr;
+ Instruction *InsertBefore = I;
+ if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+ // dyn_cast as we might get UndefValue
+ if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+ if (Masked->isZero())
+ // Mask is constant false, so no instrumentation needed.
+ continue;
+ // If we have a true or undef value, fall through to doInstrumentAddress
+ // with InsertBefore == I
+ }
+ } else {
+ IRBuilder<> IRB(I);
+ Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+ TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+ InsertBefore = ThenTerm;
+ }
+
+ IRBuilder<> IRB(InsertBefore);
+ InstrumentedAddress =
+ IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+ doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
+ Granularity, ElemTypeSize, IsWrite, SizeArgument,
+ UseCalls, Exp);
+ }
+}
+
+void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Instruction *I, bool UseCalls,
+ const DataLayout &DL) {
+ bool IsWrite = false;
+ unsigned Alignment = 0;
+ uint64_t TypeSize = 0;
+ Value *MaybeMask = nullptr;
+ Value *Addr =
+ isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
+ assert(Addr);
+
+ // Optimization experiments.
+ // The experiments can be used to evaluate potential optimizations that remove
+ // instrumentation (assess false negatives). Instead of completely removing
+ // some instrumentation, you set Exp to a non-zero value (mask of optimization
+ // experiments that want to remove instrumentation of this instruction).
+ // If Exp is non-zero, this pass will emit special calls into the runtime
+ // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
+ // make the runtime terminate the program in a special way (with a different
+ // exit status). Then you run the new compiler on a buggy corpus, collect
+ // the special terminations (ideally, you don't see them at all -- no false
+ // negatives) and make the decision on the optimization.
+ uint32_t Exp = ClForceExperiment;
+
+ if (ClOpt && ClOptGlobals) {
+ // If initialization order checking is disabled, a simple access to a
+ // dynamically initialized global is always valid.
+ GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL));
+ if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
+ isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ NumOptimizedAccessesToGlobalVar++;
+ return;
+ }
+ }
+
+ if (ClOpt && ClOptStack) {
+ // A direct inbounds access to a stack variable is always valid.
+ if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+ isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ NumOptimizedAccessesToStackVar++;
+ return;
+ }
+ }
+
+ if (IsWrite)
+ NumInstrumentedWrites++;
+ else
+ NumInstrumentedReads++;
+
+ unsigned Granularity = 1 << Mapping.Scale;
+ if (MaybeMask) {
+ instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr,
+ Alignment, Granularity, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ } else {
+ doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+ }
+}
+
+Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
+ Value *Addr, bool IsWrite,
+ size_t AccessSizeIndex,
+ Value *SizeArgument,
+ uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
+ CallInst *Call = nullptr;
+ if (SizeArgument) {
+ if (Exp == 0)
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
+ {Addr, SizeArgument});
+ else
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
+ {Addr, SizeArgument, ExpVal});
+ } else {
+ if (Exp == 0)
+ Call =
+ IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
+ else
+ Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
+ {Addr, ExpVal});
+ }
+
+ // We don't do Call->setDoesNotReturn() because the BB already has
+ // an UnreachableInst at the end.
+ // This EmptyAsm is required to avoid callback merging.
+ IRB.CreateCall(EmptyAsm, {});
+ return Call;
+}
+
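+// Builds the slow-path check for an access that may only partially overlap an
+// addressable shadow granule. Worked example (sketch): with 8-byte
+// granularity, a shadow value k in [1,7] means only the first k bytes of the
+// granule are addressable, so a 4-byte access at offset (Addr & 7) is invalid
+// exactly when (Addr & 7) + 4 - 1 >= k, which is the comparison built here.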
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue,
+ uint32_t TypeSize) {
+ size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
+ // Addr & (Granularity - 1)
+ Value *LastAccessedByte =
+ IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ // (Addr & (Granularity - 1)) + size - 1
+ if (TypeSize / 8 > 1)
+ LastAccessedByte = IRB.CreateAdd(
+ LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+ // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+ LastAccessedByte =
+ IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+ // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+ return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
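+// Emits the shadow check for a single access. The inline fast path generated
+// below is roughly (a sketch, not verbatim IR):
+//   shadow = *(shadow_base + (addr >> scale));
+//   if (shadow != 0 && ((addr & (granularity-1)) + size - 1) >= shadow)
+//     __asan_report_*(addr);
+// With UseCalls, the inline check is replaced by a call into the runtime
+// (ClMemoryAccessCallbackPrefix, "__asan_" by default).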
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+ AddrLong);
+ else
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
+ {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ return;
+ }
+
+ Type *ShadowTy =
+ IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ Value *ShadowPtr = memToShadow(AddrLong, IRB);
+ Value *CmpVal = Constant::getNullValue(ShadowTy);
+ Value *ShadowValue =
+ IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+
+ Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
+ size_t Granularity = 1ULL << Mapping.Scale;
+ TerminatorInst *CrashTerm = nullptr;
+
+ if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+ // We use branch weights for the slow path check, to indicate that the slow
+ // path is rarely taken. This seems to be the case for SPEC benchmarks.
+ TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(CheckTerm)->isUnconditional());
+ BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+ IRB.SetInsertPoint(CheckTerm);
+ Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+ if (Recover) {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
+ } else {
+ BasicBlock *CrashBlock =
+ BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
+ CrashTerm = new UnreachableInst(*C, CrashBlock);
+ BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+ ReplaceInstWithInst(CheckTerm, NewTerm);
+ }
+ } else {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
+ }
+
+ Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
+ AccessSizeIndex, SizeArgument, Exp);
+ Crash->setDebugLoc(OrigIns->getDebugLoc());
+}
+
+// Instrument accesses of unusual size or unusual alignment.
+// We cannot do this with a single check, so we do a 1-byte check for the
+// first and the last bytes. We call __asan_report_*_n(addr, real_size) to be
+// able to report the actual access size.
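+// E.g. in the non-call path a 10-byte access is covered by 1-byte checks at
+// Addr and Addr + 9, and any report goes through the sized
+// __asan_report_*_n callbacks carrying the real access size.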
+void AddressSanitizer::instrumentUnusualSizeOrAlignment(
+ Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
+ {AddrLong, Size});
+ else
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
+ {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ } else {
+ Value *LastByte = IRB.CreateIntToPtr(
+ IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+ Addr->getType());
+ instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
+ }
+}
+
+void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
+ GlobalValue *ModuleName) {
+ // Set up the arguments to our poison/unpoison functions.
+ IRBuilder<> IRB(&GlobalInit.front(),
+ GlobalInit.front().getFirstInsertionPt());
+
+ // Add a call to poison all external globals before the given function starts.
+ Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+ IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
+
+ // Add calls to unpoison all globals before each return instruction.
+ for (auto &BB : GlobalInit.getBasicBlockList())
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void AddressSanitizerModule::createInitializerPoisonCalls(
+ Module &M, GlobalValue *ModuleName) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return;
+
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!CA)
+ return;
+
+ for (Use &OP : CA->operands()) {
+ if (isa<ConstantAggregateZero>(OP)) continue;
+ ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+ // Must have a function or null ptr.
+ if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+ if (F->getName() == kAsanModuleCtorName) continue;
+ ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+ // Don't instrument CTORs that will run before asan.module_ctor.
+ if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue;
+ poisonOneInitializer(*F, ModuleName);
+ }
+ }
+}
+
+bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
+ Type *Ty = G->getValueType();
+ DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+ if (GlobalsMD.get(G).IsBlacklisted) return false;
+ if (!Ty->isSized()) return false;
+ if (!G->hasInitializer()) return false;
+ if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
+ // Touch only those globals that will not be defined in other modules.
+ // Don't handle ODR linkage types and COMDATs since other modules may be built
+ // without ASan.
+ if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
+ G->getLinkage() != GlobalVariable::PrivateLinkage &&
+ G->getLinkage() != GlobalVariable::InternalLinkage)
+ return false;
+ if (G->hasComdat()) return false;
+ // Two problems with thread-locals:
+ // - The address of the main thread's copy can't be computed at link-time.
+ // - Need to poison all copies, not just the main thread's one.
+ if (G->isThreadLocal()) return false;
+  // For now, just ignore this global if its alignment is larger than the
+  // minimal redzone size.
+ if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
+
+ if (G->hasSection()) {
+ StringRef Section = G->getSection();
+
+ // Globals from llvm.metadata aren't emitted, do not instrument them.
+ if (Section == "llvm.metadata") return false;
+ // Do not instrument globals from special LLVM sections.
+    if (Section.find("__llvm") != StringRef::npos ||
+        Section.find("__LLVM") != StringRef::npos)
+      return false;
+
+ // Do not instrument function pointers to initialization and termination
+ // routines: dynamic linker will not properly handle redzones.
+ if (Section.startswith(".preinit_array") ||
+ Section.startswith(".init_array") ||
+ Section.startswith(".fini_array")) {
+ return false;
+ }
+
+ // Callbacks put into the CRT initializer/terminator sections
+ // should not be instrumented.
+ // See https://code.google.com/p/address-sanitizer/issues/detail?id=305
+ // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
+ if (Section.startswith(".CRT")) {
+ DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n");
+ return false;
+ }
+
+ if (TargetTriple.isOSBinFormatMachO()) {
+ StringRef ParsedSegment, ParsedSection;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+ std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
+ Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
+ assert(ErrorCode.empty() && "Invalid section specifier.");
+
+ // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+ // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+ // them.
+ if (ParsedSegment == "__OBJC" ||
+ (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
+ DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+ return false;
+ }
+ // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
+ // Constant CFString instances are compiled in the following way:
+ // -- the string buffer is emitted into
+ // __TEXT,__cstring,cstring_literals
+ // -- the constant NSConstantString structure referencing that buffer
+ // is placed into __DATA,__cfstring
+ // Therefore there's no point in placing redzones into __DATA,__cfstring.
+ // Moreover, it causes the linker to crash on OS X 10.7
+ if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
+ DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+ return false;
+ }
+ // The linker merges the contents of cstring_literals and removes the
+ // trailing zeroes.
+ if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
+ DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// On Mach-O platforms, we emit global metadata in a separate section of the
+// binary in order to allow the linker to properly dead strip. This is only
+// supported on recent versions of ld64.
+bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const {
+ if (!TargetTriple.isOSBinFormatMachO())
+ return false;
+
+ if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
+ return true;
+ if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
+ return true;
+ if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
+ return true;
+
+ return false;
+}
+
+StringRef AddressSanitizerModule::getGlobalMetadataSection() const {
+ switch (TargetTriple.getObjectFormat()) {
+ case Triple::COFF: return ".ASAN$GL";
+ case Triple::ELF: return "asan_globals";
+ case Triple::MachO: return "__DATA,__asan_globals,regular";
+ default: break;
+ }
+ llvm_unreachable("unsupported object format");
+}
+
+void AddressSanitizerModule::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // Declare our poisoning and unpoisoning functions.
+ AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy));
+ AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
+ AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnpoisonGlobalsName, IRB.getVoidTy()));
+ AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
+
+ // Declare functions that register/unregister globals.
+ AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy));
+ AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
+ AsanUnregisterGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy));
+ AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
+
+ // Declare the functions that find globals in a shared object and then invoke
+ // the (un)register function on them.
+ AsanRegisterImageGlobals =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
+ AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanUnregisterImageGlobals =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
+ AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanRegisterElfGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy));
+ AsanRegisterElfGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanUnregisterElfGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy));
+ AsanUnregisterElfGlobals->setLinkage(Function::ExternalLinkage);
+}
+
+// Put the metadata and the instrumented global in the same group. This ensures
+// that the metadata is discarded if the instrumented global is discarded.
+void AddressSanitizerModule::SetComdatForGlobalMetadata(
+ GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
+ Module &M = *G->getParent();
+ Comdat *C = G->getComdat();
+ if (!C) {
+ if (!G->hasName()) {
+ // If G is unnamed, it must be internal. Give it an artificial name
+ // so we can put it in a comdat.
+ assert(G->hasLocalLinkage());
+ G->setName(Twine(kAsanGenPrefix) + "_anon_global");
+ }
+
+ if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
+ std::string Name = G->getName();
+ Name += InternalSuffix;
+ C = M.getOrInsertComdat(Name);
+ } else {
+ C = M.getOrInsertComdat(G->getName());
+ }
+
+ // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF.
+ if (TargetTriple.isOSBinFormatCOFF())
+ C->setSelectionKind(Comdat::NoDuplicates);
+ G->setComdat(C);
+ }
+
+ assert(G->hasComdat());
+ Metadata->setComdat(G->getComdat());
+}
+
+// Create a separate metadata global and put it in the appropriate ASan
+// global registration section.
+GlobalVariable *
+AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName) {
+ auto Linkage = TargetTriple.isOSBinFormatMachO()
+ ? GlobalVariable::InternalLinkage
+ : GlobalVariable::PrivateLinkage;
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, Initializer->getType(), false, Linkage, Initializer,
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
+ Metadata->setSection(getGlobalMetadataSection());
+ return Metadata;
+}
+
+IRBuilder<> AddressSanitizerModule::CreateAsanModuleDtor(Module &M) {
+ AsanDtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+ BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
+
+ return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB));
+}
+
+void AddressSanitizerModule::InstrumentGlobalsCOFF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ auto &DL = M.getDataLayout();
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // The MSVC linker always inserts padding when linking incrementally. We
+ // cope with that by aligning each struct to its size, which must be a power
+ // of two.
+ unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
+ assert(isPowerOf2_32(SizeOfGlobalStruct) &&
+ "global metadata will not be padded appropriately");
+ Metadata->setAlignment(SizeOfGlobalStruct);
+
+ SetComdatForGlobalMetadata(G, Metadata, "");
+ }
+}
+
+void AddressSanitizerModule::InstrumentGlobalsELF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
+ if (!MetadataGlobals.empty())
+ appendToCompilerUsed(M, MetadataGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // Common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ // Create start and stop symbols.
+ GlobalVariable *StartELFMetadata = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+ "__start_" + getGlobalMetadataSection());
+ StartELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+ GlobalVariable *StopELFMetadata = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+ "__stop_" + getGlobalMetadataSection());
+ StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+
+ // Create a call to register the globals with the runtime.
+ IRB.CreateCall(AsanRegisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+}
+
+void AddressSanitizerModule::InstrumentGlobalsMachO(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+  // On recent Mach-O platforms, use a structure which binds the liveness of
+  // the global variable to the metadata struct. Keep a list of the "Liveness"
+  // GVs created so they can be added to llvm.compiler.used.
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
+ SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // On recent Mach-O platforms, we emit the global metadata in a way that
+ // allows the linker to properly strip dead globals.
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
+ GlobalVariable *Liveness = new GlobalVariable(
+ M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
+ Twine("__asan_binder_") + G->getName());
+ Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+ LivenessGlobals[i] = Liveness;
+ }
+
+  // Update llvm.compiler.used, adding the new liveness globals. This is
+  // needed so that during LTO these variables stay alive. The alternative
+  // would be to have the linker handle the LTO symbols, but libLTO's
+  // current API does not expose access to the section for each symbol.
+ if (!LivenessGlobals.empty())
+ appendToCompilerUsed(M, LivenessGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+  // Common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ IRB.CreateCall(AsanRegisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+}
+
+void AddressSanitizerModule::InstrumentGlobalsWithMetadataArray(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ unsigned N = ExtendedGlobals.size();
+ assert(N > 0);
+
+ // On platforms that don't have a custom metadata section, we emit an array
+ // of global metadata structures.
+ ArrayType *ArrayOfGlobalStructTy =
+ ArrayType::get(MetadataInitializers[0]->getType(), N);
+ auto AllGlobals = new GlobalVariable(
+ M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
+
+ IRB.CreateCall(AsanRegisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+}
+
+// This function replaces all global variables with new variables that have
+// trailing redzones. It also creates a function that poisons
+// redzones and inserts this function into llvm.global_ctors.
+// Sets *CtorComdat to true if the global registration code emitted into the
+// asan constructor is comdat-compatible.
+bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+                                               bool *CtorComdat) {
+ *CtorComdat = false;
+ GlobalsMD.init(M);
+
+ SmallVector<GlobalVariable *, 16> GlobalsToChange;
+
+ for (auto &G : M.globals()) {
+ if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G);
+ }
+
+ size_t n = GlobalsToChange.size();
+ if (n == 0) {
+ *CtorComdat = true;
+ return false;
+ }
+
+ auto &DL = M.getDataLayout();
+
+ // A global is described by a structure
+ // size_t beg;
+ // size_t size;
+ // size_t size_with_redzone;
+ // const char *name;
+ // const char *module_name;
+ // size_t has_dynamic_init;
+ // void *source_location;
+ // size_t odr_indicator;
+ // We initialize an array of such structures and pass it to a run-time call.
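+  // Illustrative instance, assuming a 32-byte minimum redzone: a 40-byte
+  // global `int buf[10]` would be described roughly as
+  //   { &buf, 40, 96, "buf", <module name>, 0, <source loc>, <odr indicator> }
+  // with the 96 coming from the payload plus a 56-byte right redzone (see the
+  // redzone computation below).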
+ StructType *GlobalStructTy =
+ StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+ IntptrTy, IntptrTy, IntptrTy);
+ SmallVector<GlobalVariable *, 16> NewGlobals(n);
+ SmallVector<Constant *, 16> Initializers(n);
+
+ bool HasDynamicallyInitializedGlobals = false;
+
+  // We shouldn't merge identical module names, since this string serves as
+  // the unique module ID at run time.
+ GlobalVariable *ModuleName = createPrivateGlobalForString(
+ M, M.getModuleIdentifier(), /*AllowMerging*/ false);
+
+ for (size_t i = 0; i < n; i++) {
+ static const uint64_t kMaxGlobalRedzone = 1 << 18;
+ GlobalVariable *G = GlobalsToChange[i];
+
+ auto MD = GlobalsMD.get(G);
+ StringRef NameForGlobal = G->getName();
+ // Create string holding the global name (use global name from metadata
+ // if it's available, otherwise just write the name of global variable).
+ GlobalVariable *Name = createPrivateGlobalForString(
+ M, MD.Name.empty() ? NameForGlobal : MD.Name,
+ /*AllowMerging*/ true);
+
+ Type *Ty = G->getValueType();
+ uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ uint64_t MinRZ = MinRedzoneSizeForGlobal();
+    // Pick MinRZ <= RZ <= kMaxGlobalRedzone, aiming for RZ to be roughly
+    // 1/4 of SizeInBytes.
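+    // Worked example, assuming MinRZ == 32: for SizeInBytes == 1000,
+    //   RZ = max(32, min(kMaxGlobalRedzone, (1000 / 32 / 4) * 32)) == 224,
+    // and rounding the total up to a multiple of MinRZ adds another 24 bytes,
+    // giving a 248-byte right redzone (1000 + 248 is a multiple of 32).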
+ uint64_t RZ = std::max(
+ MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
+ uint64_t RightRedzoneSize = RZ;
+    // Round the total size up to a multiple of MinRZ.
+ if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
+ assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
+ Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
+
+ // Create a new global variable with enough space for a redzone.
+ GlobalValue::LinkageTypes Linkage = G->getLinkage();
+ if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
+ Linkage = GlobalValue::InternalLinkage;
+ GlobalVariable *NewGlobal =
+ new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
+ "", G, G->getThreadLocalMode());
+ NewGlobal->copyAttributesFrom(G);
+ NewGlobal->setAlignment(MinRZ);
+
+ // Move null-terminated C strings to "__asan_cstring" section on Darwin.
+ if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
+ G->isConstant()) {
+ auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
+ if (Seq && Seq->isCString())
+ NewGlobal->setSection("__TEXT,__asan_cstring,regular");
+ }
+
+ // Transfer the debug info. The payload starts at offset zero so we can
+ // copy the debug info over as is.
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ G->getDebugInfo(GVs);
+ for (auto *GV : GVs)
+ NewGlobal->addDebugInfo(GV);
+
+ Value *Indices2[2];
+ Indices2[0] = IRB.getInt32(0);
+ Indices2[1] = IRB.getInt32(0);
+
+ G->replaceAllUsesWith(
+ ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
+ NewGlobal->takeName(G);
+ G->eraseFromParent();
+ NewGlobals[i] = NewGlobal;
+
+ Constant *SourceLoc;
+ if (!MD.SourceLoc.empty()) {
+ auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
+ SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
+ } else {
+ SourceLoc = ConstantInt::get(IntptrTy, 0);
+ }
+
+ Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
+ GlobalValue *InstrumentedGlobal = NewGlobal;
+
+ bool CanUsePrivateAliases =
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+ TargetTriple.isOSBinFormatWasm();
+ if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
+ // Create local alias for NewGlobal to avoid crash on ODR between
+ // instrumented and non-instrumented libraries.
+ auto *GA = GlobalAlias::create(GlobalValue::InternalLinkage,
+ NameForGlobal + M.getName(), NewGlobal);
+
+ // With local aliases, we need to provide another externally visible
+ // symbol __odr_asan_XXX to detect ODR violation.
+ auto *ODRIndicatorSym =
+ new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ kODRGenPrefix + NameForGlobal, nullptr,
+ NewGlobal->getThreadLocalMode());
+
+ // Set meaningful attributes for indicator symbol.
+ ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
+ ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
+ ODRIndicatorSym->setAlignment(1);
+ ODRIndicator = ODRIndicatorSym;
+ InstrumentedGlobal = GA;
+ }
+
+ Constant *Initializer = ConstantStruct::get(
+ GlobalStructTy,
+ ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
+ ConstantInt::get(IntptrTy, SizeInBytes),
+ ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
+ ConstantExpr::getPointerCast(Name, IntptrTy),
+ ConstantExpr::getPointerCast(ModuleName, IntptrTy),
+ ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+
+ if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
+
+ DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+
+ Initializers[i] = Initializer;
+ }
+
+ std::string ELFUniqueModuleId =
+ (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
+ : "";
+
+ if (!ELFUniqueModuleId.empty()) {
+ InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+ *CtorComdat = true;
+ } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+ InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+ } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+ InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+ } else {
+ InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+ }
+
+ // Create calls for poisoning before initializers run and unpoisoning after.
+ if (HasDynamicallyInitializedGlobals)
+ createInitializerPoisonCalls(M, ModuleName);
+
+ DEBUG(dbgs() << M);
+ return true;
+}
+
+bool AddressSanitizerModule::runOnModule(Module &M) {
+ C = &(M.getContext());
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+ Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
+ initializeCallbacks(M);
+
+ if (CompileKernel)
+ return false;
+
+  // Create a module constructor. A destructor is created lazily because not
+  // all platforms and modules need it.
+ std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, kAsanVersionCheckName);
+
+ bool CtorComdat = true;
+ bool Changed = false;
+ // TODO(glider): temporarily disabled globals instrumentation for KASan.
+ if (ClGlobals) {
+ IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
+ Changed |= InstrumentGlobals(IRB, M, &CtorComdat);
+ }
+
+  // Put the constructor and destructor in comdat if both
+  // (1) global instrumentation is not TU-specific and
+  // (2) the target is ELF.
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
+ AsanCtorFunction);
+ if (AsanDtorFunction) {
+ AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority,
+ AsanDtorFunction);
+ }
+ } else {
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
+ if (AsanDtorFunction)
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
+ }
+
+ return Changed;
+}
+
+void AddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ // Create __asan_report* callbacks.
+ // IsWrite, TypeSize and Exp are encoded in the function name.
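+  // With the default prefixes (kAsanReportErrorTemplate and
+  // ClMemoryAccessCallbackPrefix, assumed here to be "__asan_report_" and
+  // "__asan_"), this is expected to produce names such as
+  // __asan_report_load4 and __asan_store8 for the fixed-size callbacks, with
+  // an "exp_" infix when Exp != 0, a "_noabort" ending in recovery mode, and
+  // sized "*_n"/"*N" variants for the remaining sizes.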
+ for (int Exp = 0; Exp < 2; Exp++) {
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string ExpStr = Exp ? "exp_" : "";
+ const std::string SuffixStr = CompileKernel ? "N" : "_n";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ SmallVector<Type *, 2> Args1{1, IntptrTy};
+ if (Exp) {
+ Type *ExpType = Type::getInt32Ty(*C);
+ Args2.push_back(ExpType);
+ Args1.push_back(ExpType);
+ }
+ AsanErrorCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
+ EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+ }
+ }
+ }
+
+ const std::string MemIntrinCallbackPrefix =
+ CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
+ AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
+ AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
+ AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));
+
+ AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()));
+
+ AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy));
+ AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy));
+ // We insert an empty inline asm after __asan_report* to avoid callback merge.
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+ StringRef(""), StringRef(""),
+ /*hasSideEffects=*/true);
+}
+
+// virtual
+bool AddressSanitizer::doInitialization(Module &M) {
+ // Initialize the private fields. No one has accessed them before.
+ GlobalsMD.init(M);
+
+ C = &(M.getContext());
+ LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
+ return true;
+}
+
+bool AddressSanitizer::doFinalization(Module &M) {
+ GlobalsMD.reset();
+ return false;
+}
+
+bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
+ // For each NSObject descendant having a +load method, this method is invoked
+ // by the ObjC runtime before any of the static constructors is called.
+ // Therefore we need to instrument such methods with a call to __asan_init
+ // at the beginning in order to initialize our runtime before any access to
+ // the shadow memory.
+ // We cannot just ignore these methods, because they may call other
+ // instrumented functions.
+ if (F.getName().find(" load]") != std::string::npos) {
+ Function *AsanInitFunction =
+ declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
+ IRBuilder<> IRB(&F.front(), F.front().begin());
+ IRB.CreateCall(AsanInitFunction, {});
+ return true;
+ }
+ return false;
+}
+
+void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return;
+
+ IRBuilder<> IRB(&F.front().front());
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kAsanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(GlobalDynamicAddress);
+}
+
+void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
+ // Find the one possible call to llvm.localescape and pre-mark allocas passed
+ // to it as uninteresting. This assumes we haven't started processing allocas
+ // yet. This check is done up front because iterating the use list in
+ // isInterestingAlloca would be algorithmically slower.
+ assert(ProcessedAllocas.empty() && "must process localescape before allocas");
+
+ // Try to get the declaration of llvm.localescape. If it's not in the module,
+ // we can exit early.
+ if (!F.getParent()->getFunction("llvm.localescape")) return;
+
+  // Look for a call to llvm.localescape in the entry block. It can't be in
+  // any other block.
+ for (Instruction &I : F.getEntryBlock()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && II->getIntrinsicID() == Intrinsic::localescape) {
+ // We found a call. Mark all the allocas passed in as uninteresting.
+ for (Value *Arg : II->arg_operands()) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
+ assert(AI && AI->isStaticAlloca() &&
+ "non-static alloca arg to localescape");
+ ProcessedAllocas[AI] = false;
+ }
+ break;
+ }
+ }
+}
+
+bool AddressSanitizer::runOnFunction(Function &F) {
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
+ if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
+ if (F.getName().startswith("__asan_")) return false;
+
+ bool FunctionModified = false;
+
+ // If needed, insert __asan_init before checking for SanitizeAddress attr.
+ // This function needs to be called even if the function body is not
+ // instrumented.
+ if (maybeInsertAsanInitAtFunctionEntry(F))
+ FunctionModified = true;
+
+ // Leave if the function doesn't need instrumentation.
+ if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
+
+ DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+
+ initializeCallbacks(*F.getParent());
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ FunctionStateRAII CleanupObj(this);
+
+ maybeInsertDynamicShadowAtFunctionEntry(F);
+
+ // We can't instrument allocas used with llvm.localescape. Only static allocas
+ // can be passed to that intrinsic.
+ markEscapedLocalAllocas(F);
+
+ // We want to instrument every address only once per basic block (unless there
+ // are calls between uses).
+ SmallSet<Value *, 16> TempsToInstrument;
+ SmallVector<Instruction *, 16> ToInstrument;
+ SmallVector<Instruction *, 8> NoReturnCalls;
+ SmallVector<BasicBlock *, 16> AllBlocks;
+ SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
+ int NumAllocas = 0;
+ bool IsWrite;
+ unsigned Alignment;
+ uint64_t TypeSize;
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ // Fill the set of memory operations to instrument.
+ for (auto &BB : F) {
+ AllBlocks.push_back(&BB);
+ TempsToInstrument.clear();
+ int NumInsnsPerBB = 0;
+ for (auto &Inst : BB) {
+ if (LooksLikeCodeInBug11395(&Inst)) return false;
+ Value *MaybeMask = nullptr;
+ if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
+ &Alignment, &MaybeMask)) {
+ if (ClOpt && ClOptSameTemp) {
+ // If we have a mask, skip instrumentation if we've already
+ // instrumented the full object. But don't add to TempsToInstrument
+ // because we might get another load/store with a different mask.
+ if (MaybeMask) {
+ if (TempsToInstrument.count(Addr))
+ continue; // We've seen this (whole) temp in the current BB.
+ } else {
+ if (!TempsToInstrument.insert(Addr).second)
+ continue; // We've seen this temp in the current BB.
+ }
+ }
+ } else if (ClInvalidPointerPairs &&
+ isInterestingPointerComparisonOrSubtraction(&Inst)) {
+ PointerComparisonsOrSubtracts.push_back(&Inst);
+ continue;
+ } else if (isa<MemIntrinsic>(Inst)) {
+ // ok, take it.
+ } else {
+ if (isa<AllocaInst>(Inst)) NumAllocas++;
+ CallSite CS(&Inst);
+ if (CS) {
+ // A call inside BB.
+ TempsToInstrument.clear();
+ if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction());
+ }
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ continue;
+ }
+ ToInstrument.push_back(&Inst);
+ NumInsnsPerBB++;
+ if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
+ }
+ }
+
+ bool UseCalls =
+ CompileKernel ||
+ (ClInstrumentationWithCallsThreshold >= 0 &&
+ ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOpts ObjSizeOpts;
+ ObjSizeOpts.RoundToAlign = true;
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
+
+ // Instrument.
+ int NumInstrumented = 0;
+ for (auto Inst : ToInstrument) {
+ if (ClDebugMin < 0 || ClDebugMax < 0 ||
+ (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
+ if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment))
+ instrumentMop(ObjSizeVis, Inst, UseCalls,
+ F.getParent()->getDataLayout());
+ else
+ instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
+ NumInstrumented++;
+ }
+
+ FunctionStackPoisoner FSP(F, *this);
+ bool ChangedStack = FSP.runOnFunction();
+
+ // We must unpoison the stack before every NoReturn call (throw, _exit, etc).
+ // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
+ for (auto CI : NoReturnCalls) {
+ IRBuilder<> IRB(CI);
+ IRB.CreateCall(AsanHandleNoReturnFunc, {});
+ }
+
+ for (auto Inst : PointerComparisonsOrSubtracts) {
+ instrumentPointerComparisonOrSubtraction(Inst);
+ NumInstrumented++;
+ }
+
+ if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
+ FunctionModified = true;
+
+ DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
+
+ return FunctionModified;
+}
+
+// Workaround for bug 11395: we don't want to instrument stack in functions
+// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
+// FIXME: remove once the bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+ if (LongSize != 32) return false;
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (!CI || !CI->isInlineAsm()) return false;
+ if (CI->getNumArgOperands() <= 5) return false;
+ // We have inline assembly with quite a few arguments.
+ return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+ std::string Suffix = itostr(i);
+ AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
+ IntptrTy));
+ AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+ IRB.getVoidTy(), IntptrTy, IntptrTy));
+ }
+ if (ASan.UseAfterScope) {
+ AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy));
+ AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy));
+ }
+
+ for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
+ std::ostringstream Name;
+ Name << kAsanSetShadowPrefix;
+ Name << std::setw(2) << std::setfill('0') << std::hex << Val;
+ AsanSetShadowFunc[Val] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy));
+ }
+
+ AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
+ AsanAllocasUnpoisonFunc =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
+}
+
+void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB,
+ Value *ShadowBase) {
+ if (Begin >= End)
+ return;
+
+ const size_t LargestStoreSizeInBytes =
+ std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
+
+ const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
+
+  // Poison the given range in shadow using the largest possible store size,
+  // without leading and trailing zeros in ShadowMask. Zeros never change, so
+  // they need neither poisoning nor unpoisoning. Still, we don't mind if some
+  // of them end up in the middle of a store.
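+  // E.g., on a little-endian 64-bit target, a run of four 0xf1 shadow bytes
+  // that ends the range is emitted as one 4-byte store of 0xf1f1f1f1 with
+  // alignment 1, rather than four separate 1-byte stores.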
+ for (size_t i = Begin; i < End;) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ ++i;
+ continue;
+ }
+
+ size_t StoreSizeInBytes = LargestStoreSizeInBytes;
+ // Fit store size into the range.
+ while (StoreSizeInBytes > End - i)
+ StoreSizeInBytes /= 2;
+
+ // Minimize store size by trimming trailing zeros.
+ for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
+ while (j <= StoreSizeInBytes / 2)
+ StoreSizeInBytes /= 2;
+ }
+
+ uint64_t Val = 0;
+ for (size_t j = 0; j < StoreSizeInBytes; j++) {
+ if (IsLittleEndian)
+ Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+ else
+ Val = (Val << 8) | ShadowBytes[i + j];
+ }
+
+ Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+ Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
+ IRB.CreateAlignedStore(
+ Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()), 1);
+
+ i += StoreSizeInBytes;
+ }
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ assert(ShadowMask.size() == ShadowBytes.size());
+ size_t Done = Begin;
+ for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ continue;
+ }
+ uint8_t Val = ShadowBytes[i];
+ if (!AsanSetShadowFunc[Val])
+ continue;
+
+ // Skip same values.
+ for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
+ }
+
+ if (j - i >= ClMaxInlinePoisoningSize) {
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
+ IRB.CreateCall(AsanSetShadowFunc[Val],
+ {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
+ ConstantInt::get(IntptrTy, j - i)});
+ Done = j;
+ }
+ }
+
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
+}
+
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes
+// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
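+// E.g., assuming kMinStackMallocSize == 64, frames of up to 64 bytes map to
+// class 0, 65..128 bytes to class 1, 129..256 bytes to class 2, and so on,
+// doubling the limit for each class up to kMaxAsanStackMallocSizeClass.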
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+ assert(LocalStackSize <= kMaxStackMallocSize);
+ uint64_t MaxSize = kMinStackMallocSize;
+ for (int i = 0;; i++, MaxSize *= 2)
+ if (LocalStackSize <= MaxSize) return i;
+ llvm_unreachable("impossible LocalStackSize");
+}
+
+PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
+ Value *ValueIfTrue,
+ Instruction *ThenTerm,
+ Value *ValueIfFalse) {
+ PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
+ BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
+ PHI->addIncoming(ValueIfFalse, CondBlock);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ PHI->addIncoming(ValueIfTrue, ThenBlock);
+ return PHI;
+}
+
+Value *FunctionStackPoisoner::createAllocaForLayout(
+ IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
+ AllocaInst *Alloca;
+ if (Dynamic) {
+ Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
+ ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
+ "MyAlloca");
+ } else {
+ Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
+ nullptr, "MyAlloca");
+ assert(Alloca->isStaticAlloca());
+ }
+ assert((ClRealignStack & (ClRealignStack - 1)) == 0);
+ size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
+ Alloca->setAlignment(FrameAlignment);
+ return IRB.CreatePointerCast(Alloca, IntptrTy);
+}
+
+void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
+ BasicBlock &FirstBB = *F.begin();
+ IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
+ DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
+ IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
+ DynamicAllocaLayout->setAlignment(32);
+}
+
+void FunctionStackPoisoner::processDynamicAllocas() {
+ if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
+ assert(DynamicAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ // Insert poison calls for lifetime intrinsics for dynamic allocas.
+ for (const auto &APC : DynamicAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(!APC.AI->isStaticAlloca());
+
+ IRBuilder<> IRB(APC.InsBefore);
+ poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
+    // Dynamic allocas will be unpoisoned unconditionally below in
+    // unpoisonDynamicAllocas.
+    // Flag that we need to unpoison static allocas.
+ }
+
+ // Handle dynamic allocas.
+ createDynamicAllocasInitStorage();
+ for (auto &AI : DynamicAllocaVec)
+ handleDynamicAllocaCall(AI);
+ unpoisonDynamicAllocas();
+}
+
+void FunctionStackPoisoner::processStaticAllocas() {
+ if (AllocaVec.empty()) {
+ assert(StaticAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ int StackMallocIdx = -1;
+ DebugLoc EntryDebugLocation;
+ if (auto SP = F.getSubprogram())
+ EntryDebugLocation = DebugLoc::get(SP->getScopeLine(), 0, SP);
+
+ Instruction *InsBefore = AllocaVec[0];
+ IRBuilder<> IRB(InsBefore);
+ IRB.SetCurrentDebugLocation(EntryDebugLocation);
+
+ // Make sure non-instrumented allocas stay in the entry block. Otherwise,
+ // debug info is broken, because only entry-block allocas are treated as
+ // regular stack slots.
+ auto InsBeforeB = InsBefore->getParent();
+ assert(InsBeforeB == &F.getEntryBlock());
+ for (auto *AI : StaticAllocasToMoveUp)
+ if (AI->getParent() == InsBeforeB)
+ AI->moveBefore(InsBefore);
+
+ // If we have a call to llvm.localescape, keep it in the entry block.
+ if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
+
+ SmallVector<ASanStackVariableDescription, 16> SVD;
+ SVD.reserve(AllocaVec.size());
+ for (AllocaInst *AI : AllocaVec) {
+ ASanStackVariableDescription D = {AI->getName().data(),
+ ASan.getAllocaSizeInBytes(*AI),
+ 0,
+ AI->getAlignment(),
+ AI,
+ 0,
+ 0};
+ SVD.push_back(D);
+ }
+
+  // Minimal header size (left redzone) is 4 pointers,
+  // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
+ size_t MinHeaderSize = ASan.LongSize / 2;
+ const ASanStackFrameLayout &L =
+ ComputeASanStackFrameLayout(SVD, 1ULL << Mapping.Scale, MinHeaderSize);
+
+ // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
+ DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
+ for (auto &Desc : SVD)
+ AllocaToSVDMap[Desc.AI] = &Desc;
+
+ // Update SVD with information from lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(APC.AI->isStaticAlloca());
+
+ ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ Desc.LifetimeSize = Desc.Size;
+ if (const DILocation *FnLoc = EntryDebugLocation.get()) {
+ if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
+ if (LifetimeLoc->getFile() == FnLoc->getFile())
+ if (unsigned Line = LifetimeLoc->getLine())
+ Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
+ }
+ }
+ }
+
+ auto DescriptionString = ComputeASanStackFrameDescription(SVD);
+ DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+ uint64_t LocalStackSize = L.FrameSize;
+ bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
+ LocalStackSize <= kMaxStackMallocSize;
+ bool DoDynamicAlloca = ClDynamicAllocaStack;
+  // Don't do dynamic alloca or stack malloc if:
+  // 1) There is inline asm: too often it makes assumptions about which
+  //    registers are available.
+  // 2) There is a returns_twice call (typically setjmp), which is
+  //    optimization-hostile, and doesn't play well with the introduced
+  //    indirect, register-relative calculation of local variable addresses.
+ DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
+ DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
+
+ Value *StaticAlloca =
+ DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
+
+ Value *FakeStack;
+ Value *LocalStackBase;
+
+ if (DoStackMalloc) {
+ // void *FakeStack = __asan_option_detect_stack_use_after_return
+ // ? __asan_stack_malloc_N(LocalStackSize)
+ // : nullptr;
+ // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
+ Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
+ kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
+ Value *UseAfterReturnIsEnabled =
+ IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUseAfterReturn),
+ Constant::getNullValue(IRB.getInt32Ty()));
+ Instruction *Term =
+ SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
+ IRBuilder<> IRBIf(Term);
+ IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
+ StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+ assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+ Value *FakeStackValue =
+ IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
+ ConstantInt::get(IntptrTy, LocalStackSize));
+ IRB.SetInsertPoint(InsBefore);
+ IRB.SetCurrentDebugLocation(EntryDebugLocation);
+ FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
+ ConstantInt::get(IntptrTy, 0));
+
+ Value *NoFakeStack =
+ IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+ Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
+ IRBIf.SetInsertPoint(Term);
+ IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
+ Value *AllocaValue =
+ DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
+ IRB.SetInsertPoint(InsBefore);
+ IRB.SetCurrentDebugLocation(EntryDebugLocation);
+ LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+ } else {
+ // void *FakeStack = nullptr;
+ // void *LocalStackBase = alloca(LocalStackSize);
+ FakeStack = ConstantInt::get(IntptrTy, 0);
+ LocalStackBase =
+ DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
+ }
+
+ // Replace Alloca instructions with base+offset.
+ for (const auto &Desc : SVD) {
+ AllocaInst *AI = Desc.AI;
+ Value *NewAllocaPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
+ AI->getType());
+ replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, DIExpression::NoDeref);
+ AI->replaceAllUsesWith(NewAllocaPtr);
+ }
+
+ // The left-most redzone has enough space for at least 4 pointers.
+ // Write the Magic value to redzone[0].
+ Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
+ IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
+ BasePlus0);
+ // Write the frame description constant to redzone[1].
+ Value *BasePlus1 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
+ IntptrPtrTy);
+ GlobalVariable *StackDescriptionGlobal =
+ createPrivateGlobalForString(*F.getParent(), DescriptionString,
+ /*AllowMerging*/ true);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
+ IRB.CreateStore(Description, BasePlus1);
+ // Write the PC to redzone[2].
+ Value *BasePlus2 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
+ IntptrPtrTy);
+ IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
+
+ const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
+
+ // Poison the stack red zones at the entry.
+ Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+  // As the mask we must use the most poisoned case: redzones plus after-scope
+  // bytes. As the bytes we can use either the same or just the redzones.
+ copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
+
+ if (!StaticAllocaPoisonCallVec.empty()) {
+ const auto &ShadowInScope = GetShadowBytes(SVD, L);
+
+ // Poison static allocas near lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ assert(Desc.Offset % L.Granularity == 0);
+ size_t Begin = Desc.Offset / L.Granularity;
+ size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
+
+ IRBuilder<> IRB(APC.InsBefore);
+ copyToShadow(ShadowAfterScope,
+ APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
+ IRB, ShadowBase);
+ }
+ }
+
+ SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
+ SmallVector<uint8_t, 64> ShadowAfterReturn;
+
+ // (Un)poison the stack before all ret instructions.
+ for (auto Ret : RetVec) {
+ IRBuilder<> IRBRet(Ret);
+ // Mark the current frame as retired.
+ IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
+ BasePlus0);
+ if (DoStackMalloc) {
+ assert(StackMallocIdx >= 0);
+ // if FakeStack != 0 // LocalStackBase == FakeStack
+ // // In use-after-return mode, poison the whole stack frame.
+ // if StackMallocIdx <= 4
+ // // For small sizes inline the whole thing:
+ // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+ // **SavedFlagPtr(FakeStack) = 0
+ // else
+ // __asan_stack_free_N(FakeStack, LocalStackSize)
+ // else
+ // <This is not a fake stack; unpoison the redzones>
+ Value *Cmp =
+ IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
+
+ IRBuilder<> IRBPoison(ThenTerm);
+ if (StackMallocIdx <= 4) {
+ int ClassSize = kMinStackMallocSize << StackMallocIdx;
+ ShadowAfterReturn.resize(ClassSize / L.Granularity,
+ kAsanStackUseAfterReturnMagic);
+ copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
+ ShadowBase);
+ Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+ FakeStack,
+ ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+ Value *SavedFlagPtr = IRBPoison.CreateLoad(
+ IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+ IRBPoison.CreateStore(
+ Constant::getNullValue(IRBPoison.getInt8Ty()),
+ IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+ } else {
+ // For larger frames call __asan_stack_free_*.
+ IRBPoison.CreateCall(
+ AsanStackFreeFunc[StackMallocIdx],
+ {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
+ }
+
+ IRBuilder<> IRBElse(ElseTerm);
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
+ } else {
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
+ }
+ }
+
+ // We are done. Remove the old unused alloca instructions.
+ for (auto AI : AllocaVec) AI->eraseFromParent();
+}
+
+void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
+ IRBuilder<> &IRB, bool DoPoison) {
+ // For now just insert the call to ASan runtime.
+ Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+ Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+ IRB.CreateCall(
+ DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
+ {AddrArg, SizeArg});
+}
+
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+// could be poisoned by previous llvm.lifetime.end instruction, as the
+// variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+// unpoison the whole stack frame at function exit.
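+//
+// An illustrative IR pattern this applies to (the exact intrinsic mangling
+// varies by LLVM version):
+//   %buf = alloca [16 x i8], align 16
+//   %p = bitcast [16 x i8]* %buf to i8*
+//   call void @llvm.lifetime.start.p0i8(i64 16, i8* %p)
+//   ...                                 ; %buf is in scope and unpoisoned
+//   call void @llvm.lifetime.end.p0i8(i64 16, i8* %p)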
+
+AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
+ // We're interested only in allocas we can handle.
+ return ASan.isInterestingAlloca(*AI) ? AI : nullptr;
+ // See if we've already calculated (or started to calculate) alloca for a
+ // given value.
+ AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
+ if (I != AllocaForValue.end()) return I->second;
+ // Store 0 while we're calculating alloca for value V to avoid
+ // infinite recursion if the value references itself.
+ AllocaForValue[V] = nullptr;
+ AllocaInst *Res = nullptr;
+ if (CastInst *CI = dyn_cast<CastInst>(V))
+ Res = findAllocaForValue(CI->getOperand(0));
+ else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ for (Value *IncValue : PN->incoming_values()) {
+ // Allow self-referencing phi-nodes.
+ if (IncValue == PN) continue;
+ AllocaInst *IncValueAI = findAllocaForValue(IncValue);
+ // AI for incoming values should exist and should all be equal.
+ if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res))
+ return nullptr;
+ Res = IncValueAI;
+ }
+ } else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) {
+ Res = findAllocaForValue(EP->getPointerOperand());
+ } else {
+ DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V << "\n");
+ }
+ if (Res) AllocaForValue[V] = Res;
+ return Res;
+}
+
+void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
+ IRBuilder<> IRB(AI);
+
+ const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
+ const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
+
+ Value *Zero = Constant::getNullValue(IntptrTy);
+ Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
+ Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
+
+ // Since we need to extend the alloca with additional memory to place
+ // the redzones, and OldSize is the number of allocated elements of
+ // ElementSize bytes each, compute the allocated memory size in bytes
+ // as OldSize * ElementSize.
+ const unsigned ElementSize =
+ F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
+ Value *OldSize =
+ IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
+ ConstantInt::get(IntptrTy, ElementSize));
+
+ // PartialSize = OldSize % 32
+ Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
+
+ // Misalign = kAllocaRzSize - PartialSize;
+ Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
+
+ // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
+ Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
+ Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
+
+ // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize
+ // Align bytes are added for the left redzone, PartialPadding for a
+ // possible partial redzone, and kAllocaRzSize for the right redzone.
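+ // For example, with kAllocaRzSize == 32, Align == 32 and OldSize == 40:
+ // PartialSize == 8, Misalign == 24, PartialPadding == 24, so
+ // AdditionalChunkSize == 32 + 32 + 24 == 88 and NewSize == 40 + 88 == 128.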
+ Value *AdditionalChunkSize = IRB.CreateAdd(
+ ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding);
+
+ Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
+
+ // Insert new alloca with new NewSize and Align params.
+ AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
+ NewAlloca->setAlignment(Align);
+
+ // NewAddress = Address + Align
+ Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+ ConstantInt::get(IntptrTy, Align));
+
+ // Insert an __asan_alloca_poison call for the newly created alloca.
+ IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
+
+ // Store the last alloca's address to DynamicAllocaLayout. We'll need it
+ // later to unpoison the dynamic allocas.
+ IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout);
+
+ Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+
+ // Replace all uses of the address returned by the old alloca with NewAddressPtr.
+ AI->replaceAllUsesWith(NewAddressPtr);
+
+ // We are done. Erase old alloca from parent.
+ AI->eraseFromParent();
+}
+
+// isSafeAccess returns true if Addr is always inbounds with respect to its
+// base object. For example, it is a field access or an array access with a
+// constant inbounds index.
+bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Value *Addr, uint64_t TypeSize) const {
+ SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
+ if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
+ uint64_t Size = SizeOffset.first.getZExtValue();
+ int64_t Offset = SizeOffset.second.getSExtValue();
+ // Three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
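+ // For example, with a 16-byte base object, Offset == 8 and a 4-byte access
+ // (TypeSize == 32): 8 >= 0, 16 >= 8, and 16 - 8 == 8 >= 4, so the access is
+ // statically known to be in bounds.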
+ return Offset >= 0 && Size >= uint64_t(Offset) &&
+ Size - uint64_t(Offset) >= TypeSize / 8;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
new file mode 100644
index 000000000000..a193efe902cf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -0,0 +1,212 @@
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that instruments the code to perform run-time
+// bounds checking on loads, stores, and other memory intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bounds-checking"
+
+static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
+ cl::desc("Use one trap block per function"));
+
+STATISTIC(ChecksAdded, "Bounds checks added");
+STATISTIC(ChecksSkipped, "Bounds checks skipped");
+STATISTIC(ChecksUnable, "Bounds checks unable to add");
+
+typedef IRBuilder<TargetFolder> BuilderTy;
+
+namespace {
+ struct BoundsChecking : public FunctionPass {
+ static char ID;
+
+ BoundsChecking() : FunctionPass(ID) {
+ initializeBoundsCheckingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ private:
+ const TargetLibraryInfo *TLI;
+ ObjectSizeOffsetEvaluator *ObjSizeEval;
+ BuilderTy *Builder;
+ Instruction *Inst;
+ BasicBlock *TrapBB;
+
+ BasicBlock *getTrapBB();
+ void emitBranchToTrap(Value *Cmp = nullptr);
+ bool instrument(Value *Ptr, Value *Val, const DataLayout &DL);
+ };
+}
+
+char BoundsChecking::ID = 0;
+INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking",
+ false, false)
+
+
+/// getTrapBB - create a basic block that traps. All failing bounds checks
+/// branch to this block. With -bounds-checking-single-trap there is only one
+/// trap block per function; otherwise a new one is created for each check.
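+/// The generated block looks like:
+///   trap:
+///     call void @llvm.trap()
+///     unreachable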
+BasicBlock *BoundsChecking::getTrapBB() {
+ if (TrapBB && SingleTrapBB)
+ return TrapBB;
+
+ Function *Fn = Inst->getParent()->getParent();
+ IRBuilder<>::InsertPointGuard Guard(*Builder);
+ TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+ Builder->SetInsertPoint(TrapBB);
+
+ llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
+ CallInst *TrapCall = Builder->CreateCall(F, {});
+ TrapCall->setDoesNotReturn();
+ TrapCall->setDoesNotThrow();
+ TrapCall->setDebugLoc(Inst->getDebugLoc());
+ Builder->CreateUnreachable();
+
+ return TrapBB;
+}
+
+
+/// emitBranchToTrap - emit a branch instruction to a trap block.
+/// If Cmp is non-null, perform a jump only if its value evaluates to true.
+void BoundsChecking::emitBranchToTrap(Value *Cmp) {
+ // If the comparison is a constant, either skip the check entirely (always
+ // false) or turn the branch into an unconditional one (always true).
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp);
+ if (C) {
+ ++ChecksSkipped;
+ if (!C->getZExtValue())
+ return;
+ else
+ Cmp = nullptr; // unconditional branch
+ }
+ ++ChecksAdded;
+
+ BasicBlock::iterator Inst = Builder->GetInsertPoint();
+ BasicBlock *OldBB = Inst->getParent();
+ BasicBlock *Cont = OldBB->splitBasicBlock(Inst);
+ OldBB->getTerminator()->eraseFromParent();
+
+ if (Cmp)
+ BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB);
+ else
+ BranchInst::Create(getTrapBB(), OldBB);
+}
+
+
+/// instrument - adds run-time bounds checks to memory accessing instructions.
+/// Ptr is the pointer that will be read/written, and InstVal is either the
+/// result from the load or the value being stored. It is used to determine the
+/// size of memory block that is touched.
+/// Returns true if any change was made to the IR, false otherwise.
+bool BoundsChecking::instrument(Value *Ptr, Value *InstVal,
+ const DataLayout &DL) {
+ uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
+ DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
+
+ SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr);
+
+ if (!ObjSizeEval->bothKnown(SizeOffset)) {
+ ++ChecksUnable;
+ return false;
+ }
+
+ Value *Size = SizeOffset.first;
+ Value *Offset = SizeOffset.second;
+ ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+ Type *IntTy = DL.getIntPtrType(Ptr->getType());
+ Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+ // Three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ //
+ // Optimization: if Size is known to be >= 0 (signed), skip the first check.
+ // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows.
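+ // For example, for a 24-byte object with Offset == 20 and NeededSize == 8:
+ // 24 - 20 == 4 < 8 (unsigned), so Cmp3 is true and the branch to the trap
+ // block is taken at run time.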
+ Value *ObjSize = Builder->CreateSub(Size, Offset);
+ Value *Cmp2 = Builder->CreateICmpULT(Size, Offset);
+ Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Or = Builder->CreateOr(Cmp2, Cmp3);
+ if (!SizeCI || SizeCI->getValue().slt(0)) {
+ Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+ Or = Builder->CreateOr(Cmp1, Or);
+ }
+ emitBranchToTrap(Or);
+
+ return true;
+}
+
+bool BoundsChecking::runOnFunction(Function &F) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ TrapBB = nullptr;
+ BuilderTy TheBuilder(F.getContext(), TargetFolder(DL));
+ Builder = &TheBuilder;
+ ObjectSizeOffsetEvaluator TheObjSizeEval(DL, TLI, F.getContext(),
+ /*RoundToAlign=*/true);
+ ObjSizeEval = &TheObjSizeEval;
+
+ // Check HANDLE_MEMORY_INST in include/llvm/Instruction.def for the full set
+ // of memory-touching instructions.
+ std::vector<Instruction*> WorkList;
+ for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+ Instruction *I = &*i;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) ||
+ isa<AtomicRMWInst>(I))
+ WorkList.push_back(I);
+ }
+
+ bool MadeChange = false;
+ for (Instruction *i : WorkList) {
+ Inst = i;
+
+ Builder->SetInsertPoint(Inst);
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ MadeChange |= instrument(LI->getPointerOperand(), LI, DL);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ MadeChange |=
+ instrument(SI->getPointerOperand(), SI->getValueOperand(), DL);
+ } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ MadeChange |=
+ instrument(AI->getPointerOperand(), AI->getCompareOperand(), DL);
+ } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
+ MadeChange |=
+ instrument(AI->getPointerOperand(), AI->getValOperand(), DL);
+ } else {
+ llvm_unreachable("unknown Instruction type");
+ }
+ }
+ return MadeChange;
+}
+
+FunctionPass *llvm::createBoundsCheckingPass() {
+ return new BoundsChecking();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
new file mode 100644
index 000000000000..16e2e6b4e730
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -0,0 +1,230 @@
+//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Union-find algorithm to compute Minimum Spanning Tree
+// for a given CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "cfgmst"
+
+namespace llvm {
+
+/// \brief A union-find based Minimum Spanning Tree for the CFG
+///
+/// Implements a Union-find algorithm to compute Minimum Spanning Tree
+/// for a given CFG.
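+///
+/// A client supplies its own Edge and BBInfo types; the members used below
+/// imply an interface roughly like this (hypothetical names):
+///   struct MyBBInfo {
+///     MyBBInfo *Group;  // union-find parent, initialized to this
+///     uint32_t Rank;    // union-by-rank bookkeeping, initialized to 0
+///     uint32_t Index;   // dense index assigned in addEdge()
+///     MyBBInfo(unsigned IX) : Group(this), Rank(0), Index(IX) {}
+///     std::string infoString() const;
+///   };
+///   struct MyEdge {
+///     const BasicBlock *SrcBB, *DestBB;
+///     uint64_t Weight;
+///     bool InMST = false, Removed = false, IsCritical = false;
+///     MyEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W)
+///         : SrcBB(Src), DestBB(Dest), Weight(W) {}
+///     std::string infoString() const;
+///   };
+///   CFGMST<MyEdge, MyBBInfo> MST(F, &BPI, &BFI);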
+template <class Edge, class BBInfo> class CFGMST {
+public:
+ Function &F;
+
+ // Store all the edges in CFG. It may contain some stale edges
+ // when Removed is set.
+ std::vector<std::unique_ptr<Edge>> AllEdges;
+
+ // This map records the auxiliary information for each BB.
+ DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
+
+ // Find the root group of G and compress the path from G to the root.
+ BBInfo *findAndCompressGroup(BBInfo *G) {
+ if (G->Group != G)
+ G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
+ return static_cast<BBInfo *>(G->Group);
+ }
+
+ // Union BB1 and BB2 into the same group and return true.
+ // Returns false if BB1 and BB2 are already in the same group.
+ bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+ BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+ BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+ if (BB1G == BB2G)
+ return false;
+
+ // Make the smaller-rank tree a direct child of the root of the
+ // higher-rank tree.
+ if (BB1G->Rank < BB2G->Rank)
+ BB1G->Group = BB2G;
+ else {
+ BB2G->Group = BB1G;
+ // If the ranks are the same, increment the rank of the new root by one.
+ if (BB1G->Rank == BB2G->Rank)
+ BB1G->Rank++;
+ }
+ return true;
+ }
+
+ // Given a BB, return its auxiliary information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ assert(It->second.get() != nullptr);
+ return *It->second.get();
+ }
+
+ // Given a BB, return its auxiliary information if it's available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ if (It == BBInfos.end())
+ return nullptr;
+ return It->second.get();
+ }
+
+ // Traverse the CFG, find all the edges, and assign a weight to each one.
+ // Edges with large weights will be put into the MST first, so they are less
+ // likely to be instrumented.
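+ // When BPI/BFI are available, an edge's weight is roughly
+ // BlockFreq(Src) * EdgeProbability(Src -> Dst); weights of critical edges
+ // are additionally scaled by 1000 so that they are more likely to end up in
+ // the MST and therefore not be instrumented.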
+ void buildEdges() {
+ DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+ const BasicBlock *BB = &(F.getEntryBlock());
+ uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+ // Add a fake edge to the entry.
+ addEdge(nullptr, BB, EntryWeight);
+
+ // Special handling for single BB functions.
+ if (succ_empty(BB)) {
+ addEdge(BB, nullptr, EntryWeight);
+ return;
+ }
+
+ static const uint32_t CriticalEdgeMultiplier = 1000;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ uint64_t BBWeight =
+ (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+ uint64_t Weight = 2;
+ if (int successors = TI->getNumSuccessors()) {
+ for (int i = 0; i != successors; ++i) {
+ BasicBlock *TargetBB = TI->getSuccessor(i);
+ bool Critical = isCriticalEdge(TI, i);
+ uint64_t scaleFactor = BBWeight;
+ if (Critical) {
+ if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+ scaleFactor *= CriticalEdgeMultiplier;
+ else
+ scaleFactor = UINT64_MAX;
+ }
+ if (BPI != nullptr)
+ Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+ addEdge(&*BB, TargetBB, Weight).IsCritical = Critical;
+ DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+ << TargetBB->getName() << " w=" << Weight << "\n");
+ }
+ } else {
+ addEdge(&*BB, nullptr, BBWeight);
+ DEBUG(dbgs() << " Edge: from " << BB->getName() << " to exit"
+ << " w = " << BBWeight << "\n");
+ }
+ }
+ }
+
+ // Sort CFG edges by weight, heaviest first.
+ void sortEdgesByWeight() {
+ std::stable_sort(AllEdges.begin(), AllEdges.end(),
+ [](const std::unique_ptr<Edge> &Edge1,
+ const std::unique_ptr<Edge> &Edge2) {
+ return Edge1->Weight > Edge2->Weight;
+ });
+ }
+
+ // Traverse all the edges and compute the Minimum Weight Spanning Tree
+ // using union-find algorithm.
+ void computeMinimumSpanningTree() {
+ // First, put all critical edges whose destination is a landing pad into the
+ // MST. This works around the lack of support for splitting critical edges
+ // when the destination BB is a landing pad.
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ if (Ei->IsCritical) {
+ if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+ }
+
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+
+ // Dump the Debug information about the instrumentation.
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const {
+ if (!Message.str().empty())
+ OS << Message << "\n";
+ OS << " Number of Basic Blocks: " << BBInfos.size() << "\n";
+ for (auto &BI : BBInfos) {
+ const BasicBlock *BB = BI.first;
+ OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " "
+ << BI.second->infoString() << "\n";
+ }
+
+ OS << " Number of Edges: " << AllEdges.size()
+ << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
+ uint32_t Count = 0;
+ for (auto &EI : AllEdges)
+ OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->"
+ << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n";
+ }
+
+ // Add an edge to AllEdges with weight W.
+ Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) {
+ uint32_t Index = BBInfos.size();
+ auto Iter = BBInfos.end();
+ bool Inserted;
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr));
+ if (Inserted) {
+ // Newly inserted, update the real info.
+ Iter->second = std::move(llvm::make_unique<BBInfo>(Index));
+ Index++;
+ }
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr));
+ if (Inserted)
+ // Newly inserted, update the real info.
+ Iter->second = std::move(llvm::make_unique<BBInfo>(Index));
+ AllEdges.emplace_back(new Edge(Src, Dest, W));
+ return *AllEdges.back();
+ }
+
+ BranchProbabilityInfo *BPI;
+ BlockFrequencyInfo *BFI;
+
+public:
+ CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr,
+ BlockFrequencyInfo *BFI_ = nullptr)
+ : F(Func), BPI(BPI_), BFI(BFI_) {
+ buildEdges();
+ sortEdgesByWeight();
+ computeMinimumSpanningTree();
+ }
+};
+
+} // end namespace llvm
+
+#undef DEBUG_TYPE // "cfgmst"
+
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 000000000000..a33490f6e4ac
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1628 @@
+//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own. Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation. Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label. On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// | |
+/// | unused |
+/// | |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// | union table |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// | shadow memory |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range. See the function
+/// DataFlowSanitizer::getShadowAddress below.
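+/// For example, the application address 0x700000010000 is masked to
+/// 0x000000010000 and then multiplied by 2, giving the shadow address
+/// 0x000000020000.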
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <iterator>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+
+// External symbol to be used when generating the shadow address for
+// architectures with multiple VMAs. Instead of using a constant integer,
+// the runtime will set the external mask based on the VMA range.
+static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask";
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct. For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16. This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+ "dfsan-preserve-alignment",
+ cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+ cl::init(false));
+
+// The ABI list files control how shadow parameters are passed. The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::list<std::string> ClABIListFiles(
+ "dfsan-abilist",
+ cl::desc("File listing native ABI functions and how the pass treats them"),
+ cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+ "dfsan-args-abi",
+ cl::desc("Use the argument ABI rather than the TLS ABI"),
+ cl::Hidden);
+
+// Controls whether the pass includes or ignores the labels of pointers in load
+// instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnLoad(
+ "dfsan-combine-pointer-labels-on-load",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "loading from memory."),
+ cl::Hidden, cl::init(true));
+
+// Controls whether the pass includes or ignores the labels of pointers in
+// store instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnStore(
+ "dfsan-combine-pointer-labels-on-store",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "storing in memory."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+ "dfsan-debug-nonzero-labels",
+ cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+ "load or return with a nonzero label"),
+ cl::Hidden);
+
+
+namespace {
+
+StringRef GetGlobalTypeString(const GlobalValue &G) {
+ // Types of GlobalVariables are always pointer types.
+ Type *GType = G.getValueType();
+ // For now we support blacklisting struct types only.
+ if (StructType *SGType = dyn_cast<StructType>(GType)) {
+ if (!SGType->isLiteral())
+ return SGType->getName();
+ }
+ return "<unknown type>";
+}
+
+class DFSanABIList {
+ std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+ DFSanABIList() {}
+
+ void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
+
+ /// Returns whether either this function or its source file are listed in the
+ /// given category.
+ bool isIn(const Function &F, StringRef Category) const {
+ return isIn(*F.getParent(), Category) ||
+ SCL->inSection("fun", F.getName(), Category);
+ }
+
+ /// Returns whether this global alias is listed in the given category.
+ ///
+ /// If GA aliases a function, the alias's name is matched as a function name
+ /// would be. Similarly, aliases of globals are matched like globals.
+ bool isIn(const GlobalAlias &GA, StringRef Category) const {
+ if (isIn(*GA.getParent(), Category))
+ return true;
+
+ if (isa<FunctionType>(GA.getValueType()))
+ return SCL->inSection("fun", GA.getName(), Category);
+
+ return SCL->inSection("global", GA.getName(), Category) ||
+ SCL->inSection("type", GetGlobalTypeString(GA), Category);
+ }
+
+ /// Returns whether this module is listed in the given category.
+ bool isIn(const Module &M, StringRef Category) const {
+ return SCL->inSection("src", M.getModuleIdentifier(), Category);
+ }
+};
+
+class DataFlowSanitizer : public ModulePass {
+ friend struct DFSanFunction;
+ friend class DFSanVisitor;
+
+ enum {
+ ShadowWidth = 16
+ };
+
+ /// Which ABI should be used for instrumented functions?
+ enum InstrumentedABI {
+ /// Argument and return value labels are passed through additional
+ /// arguments and by modifying the return type.
+ IA_Args,
+
+ /// Argument and return value labels are passed through TLS variables
+ /// __dfsan_arg_tls and __dfsan_retval_tls.
+ IA_TLS
+ };
+
+ /// How should calls to uninstrumented functions be handled?
+ enum WrapperKind {
+ /// This function is present in an uninstrumented form but we don't know
+ /// how it should be handled. Print a warning and call the function anyway.
+ /// Don't label the return value.
+ WK_Warning,
+
+ /// This function does not write to (user-accessible) memory, and its return
+ /// value is unlabelled.
+ WK_Discard,
+
+ /// This function does not write to (user-accessible) memory, and the label
+ /// of its return value is the union of the label of its arguments.
+ WK_Functional,
+
+ /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+ /// where F is the name of the function. This function may wrap the
+ /// original function or provide its own implementation. This is similar to
+ /// the IA_Args ABI, except that IA_Args uses a struct return type to
+ /// pass the return value shadow in a register, while WK_Custom uses an
+ /// extra pointer argument to return the shadow. This allows the wrapped
+ /// form of the function type to be expressed in C.
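+ /// For example, if F has the C prototype "int f(int)", the custom wrapper
+ /// has the prototype
+ /// "int __dfsw_f(int x, dfsan_label x_label, dfsan_label *ret_label)",
+ /// assuming dfsan_label is the 16-bit shadow type used by the runtime.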
+ WK_Custom
+ };
+
+ Module *Mod;
+ LLVMContext *Ctx;
+ IntegerType *ShadowTy;
+ PointerType *ShadowPtrTy;
+ IntegerType *IntptrTy;
+ ConstantInt *ZeroShadow;
+ ConstantInt *ShadowPtrMask;
+ ConstantInt *ShadowPtrMul;
+ Constant *ArgTLS;
+ Constant *RetvalTLS;
+ void *(*GetArgTLSPtr)();
+ void *(*GetRetvalTLSPtr)();
+ Constant *GetArgTLS;
+ Constant *GetRetvalTLS;
+ Constant *ExternalShadowMask;
+ FunctionType *DFSanUnionFnTy;
+ FunctionType *DFSanUnionLoadFnTy;
+ FunctionType *DFSanUnimplementedFnTy;
+ FunctionType *DFSanSetLabelFnTy;
+ FunctionType *DFSanNonzeroLabelFnTy;
+ FunctionType *DFSanVarargWrapperFnTy;
+ Constant *DFSanUnionFn;
+ Constant *DFSanCheckedUnionFn;
+ Constant *DFSanUnionLoadFn;
+ Constant *DFSanUnimplementedFn;
+ Constant *DFSanSetLabelFn;
+ Constant *DFSanNonzeroLabelFn;
+ Constant *DFSanVarargWrapperFn;
+ MDNode *ColdCallWeights;
+ DFSanABIList ABIList;
+ DenseMap<Value *, Function *> UnwrappedFnMap;
+ AttrBuilder ReadOnlyNoneAttrs;
+ bool DFSanRuntimeShadowMask;
+
+ Value *getShadowAddress(Value *Addr, Instruction *Pos);
+ bool isInstrumented(const Function *F);
+ bool isInstrumented(const GlobalAlias *GA);
+ FunctionType *getArgsFunctionType(FunctionType *T);
+ FunctionType *getTrampolineFunctionType(FunctionType *T);
+ FunctionType *getCustomFunctionType(FunctionType *T);
+ InstrumentedABI getInstrumentedABI();
+ WrapperKind getWrapperKind(Function *F);
+ void addGlobalNamePrefix(GlobalValue *GV);
+ Function *buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT);
+ Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+
+ public:
+ DataFlowSanitizer(
+ const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
+ void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
+ static char ID;
+ bool doInitialization(Module &M) override;
+ bool runOnModule(Module &M) override;
+};
+
+struct DFSanFunction {
+ DataFlowSanitizer &DFS;
+ Function *F;
+ DominatorTree DT;
+ DataFlowSanitizer::InstrumentedABI IA;
+ bool IsNativeABI;
+ Value *ArgTLSPtr;
+ Value *RetvalTLSPtr;
+ AllocaInst *LabelReturnAlloca;
+ DenseMap<Value *, Value *> ValShadowMap;
+ DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+ std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
+ DenseSet<Instruction *> SkipInsts;
+ std::vector<Value *> NonZeroChecks;
+ bool AvoidNewBlocks;
+
+ struct CachedCombinedShadow {
+ BasicBlock *Block;
+ Value *Shadow;
+ };
+ DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow>
+ CachedCombinedShadows;
+ DenseMap<Value *, std::set<Value *>> ShadowElements;
+
+ DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+ : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
+ IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr),
+ LabelReturnAlloca(nullptr) {
+ DT.recalculate(*F);
+ // FIXME: Need to track down the register allocator issue which causes poor
+ // performance in pathological cases with large numbers of basic blocks.
+ AvoidNewBlocks = F->size() > 1000;
+ }
+ Value *getArgTLSPtr();
+ Value *getArgTLS(unsigned Index, Instruction *Pos);
+ Value *getRetvalTLS();
+ Value *getShadow(Value *V);
+ void setShadow(Instruction *I, Value *Shadow);
+ Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+ Value *combineOperandShadows(Instruction *Inst);
+ Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+ Instruction *Pos);
+ void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
+ Instruction *Pos);
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+ public:
+ DFSanFunction &DFSF;
+ DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+ const DataLayout &getDataLayout() const {
+ return DFSF.F->getParent()->getDataLayout();
+ }
+
+ void visitOperandShadowInst(Instruction &I);
+
+ void visitBinaryOperator(BinaryOperator &BO);
+ void visitCastInst(CastInst &CI);
+ void visitCmpInst(CmpInst &CI);
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitReturnInst(ReturnInst &RI);
+ void visitCallSite(CallSite CS);
+ void visitPHINode(PHINode &PN);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &I);
+ void visitInsertValueInst(InsertValueInst &I);
+ void visitAllocaInst(AllocaInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitMemSetInst(MemSetInst &I);
+ void visitMemTransferInst(MemTransferInst &I);
+};
+
+}
+
+char DataFlowSanitizer::ID;
+INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
+ "DataFlowSanitizer: dynamic data flow analysis.", false, false)
+
+ModulePass *
+llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles,
+ void *(*getArgTLS)(),
+ void *(*getRetValTLS)()) {
+ return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS);
+}
+
+DataFlowSanitizer::DataFlowSanitizer(
+ const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(),
+ void *(*getRetValTLS)())
+ : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
+ DFSanRuntimeShadowMask(false) {
+ std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
+ AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(),
+ ClABIListFiles.end());
+ ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles));
+}
+
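+// Returns the type of an instrumented function under the IA_Args ABI: one
+// shadow argument is appended per original parameter (plus a shadow pointer
+// argument for varargs), and a non-void return type is wrapped in a struct
+// together with the return value's shadow. For example, with the 16-bit
+// shadow type, "i32 (i32, i8*)" becomes "{ i32, i16 } (i32, i8*, i16, i16)".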
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+ llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
+ ArgTypes.append(T->getNumParams(), ShadowTy);
+ if (T->isVarArg())
+ ArgTypes.push_back(ShadowPtrTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ RetType = StructType::get(RetType, ShadowTy);
+ return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+ assert(!T->isVarArg());
+ llvm::SmallVector<Type *, 4> ArgTypes;
+ ArgTypes.push_back(T->getPointerTo());
+ ArgTypes.append(T->param_begin(), T->param_end());
+ ArgTypes.append(T->getNumParams(), ShadowTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ ArgTypes.push_back(ShadowPtrTy);
+ return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+ llvm::SmallVector<Type *, 4> ArgTypes;
+ for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
+ i != e; ++i) {
+ FunctionType *FT;
+ if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
+ *i)->getElementType()))) {
+ ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+ ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+ } else {
+ ArgTypes.push_back(*i);
+ }
+ }
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ ArgTypes.push_back(ShadowTy);
+ if (T->isVarArg())
+ ArgTypes.push_back(ShadowPtrTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ ArgTypes.push_back(ShadowPtrTy);
+ return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg());
+}
+
+bool DataFlowSanitizer::doInitialization(Module &M) {
+ llvm::Triple TargetTriple(M.getTargetTriple());
+ bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+ bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
+ TargetTriple.getArch() == llvm::Triple::mips64el;
+ bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 ||
+ TargetTriple.getArch() == llvm::Triple::aarch64_be;
+
+ const DataLayout &DL = M.getDataLayout();
+
+ Mod = &M;
+ Ctx = &M.getContext();
+ ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
+ ShadowPtrTy = PointerType::getUnqual(ShadowTy);
+ IntptrTy = DL.getIntPtrType(*Ctx);
+ ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
+ ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+ if (IsX86_64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+ else if (IsMIPS64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+ // AArch64 supports multiple VMAs and the shadow mask is set at runtime.
+ else if (IsAArch64)
+ DFSanRuntimeShadowMask = true;
+ else
+ report_fatal_error("unsupported triple");
+
+ Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
+ DFSanUnionFnTy =
+ FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false);
+ Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy };
+ DFSanUnionLoadFnTy =
+ FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false);
+ DFSanUnimplementedFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy };
+ DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ DFSanSetLabelArgs, /*isVarArg=*/false);
+ DFSanNonzeroLabelFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
+ DFSanVarargWrapperFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+
+ if (GetArgTLSPtr) {
+ Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+ ArgTLS = nullptr;
+ GetArgTLS = ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
+ PointerType::getUnqual(
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
+ }
+ if (GetRetvalTLSPtr) {
+ RetvalTLS = nullptr;
+ GetRetvalTLS = ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
+ PointerType::getUnqual(
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
+ }
+
+ ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+ return true;
+}
+
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+ return !ABIList.isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+ return !ABIList.isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+ return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+ if (ABIList.isIn(*F, "functional"))
+ return WK_Functional;
+ if (ABIList.isIn(*F, "discard"))
+ return WK_Discard;
+ if (ABIList.isIn(*F, "custom"))
+ return WK_Custom;
+
+ return WK_Warning;
+}
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+ std::string GVName = GV->getName(), Prefix = "dfs$";
+ GV->setName(Prefix + GVName);
+
+ // Try to change the name of the function in module inline asm. We only do
+ // this for specific asm directives, currently only ".symver", to try to avoid
+ // corrupting asm which happens to contain the symbol name as a substring.
+ // Note that the substitution for .symver assumes that the versioned symbol
+ // also has an instrumented name.
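+ // For example, ".symver foo,foo@@VERS_1" becomes
+ // ".symver dfs$foo,dfs$foo@@VERS_1".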
+ std::string Asm = GV->getParent()->getModuleInlineAsm();
+ std::string SearchStr = ".symver " + GVName + ",";
+ size_t Pos = Asm.find(SearchStr);
+ if (Pos != std::string::npos) {
+ Asm.replace(Pos, SearchStr.size(),
+ ".symver " + Prefix + GVName + "," + Prefix);
+ GV->getParent()->setModuleInlineAsm(Asm);
+ }
+}
+
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT) {
+ FunctionType *FT = F->getFunctionType();
+ Function *NewF = Function::Create(NewFT, NewFLink, NewFName,
+ F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+ if (F->isVarArg()) {
+ NewF->removeAttributes(AttributeList::FunctionIndex,
+ AttrBuilder().addAttribute("split-stack"));
+ CallInst::Create(DFSanVarargWrapperFn,
+ IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
+ BB);
+ new UnreachableInst(*Ctx, BB);
+ } else {
+ std::vector<Value *> Args;
+ unsigned n = FT->getNumParams();
+ for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+ Args.push_back(&*ai);
+ CallInst *CI = CallInst::Create(F, Args, "", BB);
+ if (FT->getReturnType()->isVoidTy())
+ ReturnInst::Create(*Ctx, BB);
+ else
+ ReturnInst::Create(*Ctx, CI, BB);
+ }
+
+ return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+ StringRef FName) {
+ FunctionType *FTT = getTrampolineFunctionType(FT);
+ Constant *C = Mod->getOrInsertFunction(FName, FTT);
+ Function *F = dyn_cast<Function>(C);
+ if (F && F->isDeclaration()) {
+ F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ std::vector<Value *> Args;
+ Function::arg_iterator AI = F->arg_begin(); ++AI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+ Args.push_back(&*AI);
+ CallInst *CI = CallInst::Create(&*F->arg_begin(), Args, "", BB);
+ ReturnInst *RI;
+ if (FT->getReturnType()->isVoidTy())
+ RI = ReturnInst::Create(*Ctx, BB);
+ else
+ RI = ReturnInst::Create(*Ctx, CI, BB);
+
+ DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+ Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N)
+ DFSF.ValShadowMap[&*ValAI] = &*ShadowAI;
+ DFSanVisitor(DFSF).visitCallInst(*CI);
+ if (!FT->getReturnType()->isVoidTy())
+ new StoreInst(DFSF.getShadow(RI->getReturnValue()),
+ &*std::prev(F->arg_end()), RI);
+ }
+
+ return C;
+}
+
+bool DataFlowSanitizer::runOnModule(Module &M) {
+ if (ABIList.isIn(M, "skip"))
+ return false;
+
+ if (!GetArgTLSPtr) {
+ Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+ ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+ if (!GetRetvalTLSPtr) {
+ RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+
+ ExternalShadowMask =
+ Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
+
+ DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ F->addParamAttr(0, Attribute::ZExt);
+ F->addParamAttr(1, Attribute::ZExt);
+ }
+ DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ F->addParamAttr(0, Attribute::ZExt);
+ F->addParamAttr(1, Attribute::ZExt);
+ }
+ DFSanUnionLoadFn =
+ Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ }
+ DFSanUnimplementedFn =
+ Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+ DFSanSetLabelFn =
+ Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
+ F->addParamAttr(0, Attribute::ZExt);
+ }
+ DFSanNonzeroLabelFn =
+ Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+ DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
+ DFSanVarargWrapperFnTy);
+
+ std::vector<Function *> FnsToInstrument;
+ llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
+ for (Function &i : M) {
+ if (!i.isIntrinsic() &&
+ &i != DFSanUnionFn &&
+ &i != DFSanCheckedUnionFn &&
+ &i != DFSanUnionLoadFn &&
+ &i != DFSanUnimplementedFn &&
+ &i != DFSanSetLabelFn &&
+ &i != DFSanNonzeroLabelFn &&
+ &i != DFSanVarargWrapperFn)
+ FnsToInstrument.push_back(&i);
+ }
+
+ // Give function aliases prefixes when necessary, and build wrappers where the
+ // instrumentedness is inconsistent.
+ for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+ GlobalAlias *GA = &*i;
+ ++i;
+ // Don't stop on weak. We assume people aren't playing games with the
+ // instrumentedness of overridden weak aliases.
+ if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
+ bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+ if (GAInst && FInst) {
+ addGlobalNamePrefix(GA);
+ } else if (GAInst != FInst) {
+ // Non-instrumented alias of an instrumented function, or vice versa.
+ // Replace the alias with a native-ABI wrapper of the aliasee. The pass
+ // below will take care of instrumenting it.
+ Function *NewF =
+ buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
+ NewF->takeName(GA);
+ GA->eraseFromParent();
+ FnsToInstrument.push_back(NewF);
+ }
+ }
+ }
+
+ ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+
+ // First, change the ABI of every function in the module. ABI-listed
+ // functions keep their original ABI and get a wrapper function.
+ for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+ e = FnsToInstrument.end();
+ i != e; ++i) {
+ Function &F = **i;
+ FunctionType *FT = F.getFunctionType();
+
+ bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+ FT->getReturnType()->isVoidTy());
+
+ if (isInstrumented(&F)) {
+ // Instrumented functions get a 'dfs$' prefix. This allows us to more
+ // easily identify cases of mismatching ABIs.
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+ FunctionType *NewFT = getArgsFunctionType(FT);
+ Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
+ NewF->copyAttributesFrom(&F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+ for (Function::arg_iterator FArg = F.arg_begin(),
+ NewFArg = NewF->arg_begin(),
+ FArgEnd = F.arg_end();
+ FArg != FArgEnd; ++FArg, ++NewFArg) {
+ FArg->replaceAllUsesWith(&*NewFArg);
+ }
+ NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+ for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
+ UI != UE;) {
+ BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
+ ++UI;
+ if (BA) {
+ BA->replaceAllUsesWith(
+ BlockAddress::get(NewF, BA->getBasicBlock()));
+ delete BA;
+ }
+ }
+ F.replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+ NewF->takeName(&F);
+ F.eraseFromParent();
+ *i = NewF;
+ addGlobalNamePrefix(NewF);
+ } else {
+ addGlobalNamePrefix(&F);
+ }
+ } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+ // Build a wrapper function for F. The wrapper simply calls F, and is
+ // added to FnsToInstrument so that any instrumentation according to its
+ // WrapperKind is done in the second pass below.
+ FunctionType *NewFT = getInstrumentedABI() == IA_Args
+ ? getArgsFunctionType(FT)
+ : FT;
+ Function *NewF = buildWrapperFunction(
+ &F, std::string("dfsw$") + std::string(F.getName()),
+ GlobalValue::LinkOnceODRLinkage, NewFT);
+ if (getInstrumentedABI() == IA_TLS)
+ NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
+
+ Value *WrappedFnCst =
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+ F.replaceAllUsesWith(WrappedFnCst);
+
+ UnwrappedFnMap[WrappedFnCst] = &F;
+ *i = NewF;
+
+ if (!F.isDeclaration()) {
+ // This function is probably defining an interposition of an
+ // uninstrumented function and hence needs to keep the original ABI.
+ // But any functions it may call need to use the instrumented ABI, so
+ // we instrument it in a mode which preserves the original ABI.
+ FnsWithNativeABI.insert(&F);
+
+ // This code needs to rebuild the iterators, as they may be invalidated
+ // by the push_back, taking care that the new range does not include
+ // any functions added by this code.
+ size_t N = i - FnsToInstrument.begin(),
+ Count = e - FnsToInstrument.begin();
+ FnsToInstrument.push_back(&F);
+ i = FnsToInstrument.begin() + N;
+ e = FnsToInstrument.begin() + Count;
+ }
+ // Hopefully, nobody will try to indirectly call a vararg
+ // function... yet.
+ } else if (FT->isVarArg()) {
+ UnwrappedFnMap[&F] = &F;
+ *i = nullptr;
+ }
+ }
+
+ for (Function *i : FnsToInstrument) {
+ if (!i || i->isDeclaration())
+ continue;
+
+ removeUnreachableBlocks(*i);
+
+ DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
+
+ // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+ // Build a copy of the list before iterating over it.
+ llvm::SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
+
+ for (BasicBlock *i : BBList) {
+ Instruction *Inst = &i->front();
+ while (1) {
+ // DFSanVisitor may split the current basic block, changing the current
+ // instruction's next pointer and moving the next instruction to the
+ // tail block from which we should continue.
+ Instruction *Next = Inst->getNextNode();
+ // DFSanVisitor may delete Inst, so keep track of whether it was a
+ // terminator.
+ bool IsTerminator = isa<TerminatorInst>(Inst);
+ if (!DFSF.SkipInsts.count(Inst))
+ DFSanVisitor(DFSF).visit(Inst);
+ if (IsTerminator)
+ break;
+ Inst = Next;
+ }
+ }
+
+ // We will not necessarily be able to compute the shadow for every phi node
+ // until we have visited every block. Therefore, the code that handles phi
+ // nodes adds them to the PHIFixups list so that they can be properly
+ // handled here.
+ for (std::vector<std::pair<PHINode *, PHINode *> >::iterator
+ i = DFSF.PHIFixups.begin(),
+ e = DFSF.PHIFixups.end();
+ i != e; ++i) {
+ for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+ ++val) {
+ i->second->setIncomingValue(
+ val, DFSF.getShadow(i->first->getIncomingValue(val)));
+ }
+ }
+
+ // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+ // places (i.e. instructions in basic blocks we haven't even begun visiting
+ // yet). To make our life easier, do this work in a pass after the main
+ // instrumentation.
+ if (ClDebugNonzeroLabels) {
+ for (Value *V : DFSF.NonZeroChecks) {
+ Instruction *Pos;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Pos = I->getNextNode();
+ else
+ Pos = &DFSF.F->getEntryBlock().front();
+ while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+ Pos = Pos->getNextNode();
+ IRBuilder<> IRB(Pos);
+ Value *Ne = IRB.CreateICmpNE(V, DFSF.DFS.ZeroShadow);
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
+ IRBuilder<> ThenIRB(BI);
+ ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
+ }
+ }
+ }
+
+ return false;
+}
+
+Value *DFSanFunction::getArgTLSPtr() {
+ if (ArgTLSPtr)
+ return ArgTLSPtr;
+ if (DFS.ArgTLS)
+ return ArgTLSPtr = DFS.ArgTLS;
+
+ IRBuilder<> IRB(&F->getEntryBlock().front());
+ return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {});
+}
+
+Value *DFSanFunction::getRetvalTLS() {
+ if (RetvalTLSPtr)
+ return RetvalTLSPtr;
+ if (DFS.RetvalTLS)
+ return RetvalTLSPtr = DFS.RetvalTLS;
+
+ IRBuilder<> IRB(&F->getEntryBlock().front());
+ return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {});
+}
+
+Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) {
+ IRBuilder<> IRB(Pos);
+ return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx);
+}
+
+Value *DFSanFunction::getShadow(Value *V) {
+ if (!isa<Argument>(V) && !isa<Instruction>(V))
+ return DFS.ZeroShadow;
+ Value *&Shadow = ValShadowMap[V];
+ if (!Shadow) {
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ if (IsNativeABI)
+ return DFS.ZeroShadow;
+ switch (IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *ArgTLSPtr = getArgTLSPtr();
+ Instruction *ArgTLSPos =
+ DFS.ArgTLS ? &*F->getEntryBlock().begin()
+ : cast<Instruction>(ArgTLSPtr)->getNextNode();
+ IRBuilder<> IRB(ArgTLSPos);
+ Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos));
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
+ Function::arg_iterator i = F->arg_begin();
+ while (ArgIdx--)
+ ++i;
+ Shadow = &*i;
+ assert(Shadow->getType() == DFS.ShadowTy);
+ break;
+ }
+ }
+ NonZeroChecks.push_back(Shadow);
+ } else {
+ Shadow = DFS.ZeroShadow;
+ }
+ }
+ return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+ assert(!ValShadowMap.count(I));
+ assert(Shadow->getType() == DFS.ShadowTy);
+ ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+ assert(Addr != RetvalTLS && "Reinstrumenting?");
+ IRBuilder<> IRB(Pos);
+ Value *ShadowPtrMaskValue;
+ if (DFSanRuntimeShadowMask)
+ ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
+ else
+ ShadowPtrMaskValue = ShadowPtrMask;
+ return IRB.CreateIntToPtr(
+ IRB.CreateMul(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
+ IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
+ ShadowPtrMul),
+ ShadowPtrTy);
+}
+
+// Generates IR to compute the union of the two given shadows, inserting it
+// before Pos. Returns the computed union Value.
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
+ if (V1 == DFS.ZeroShadow)
+ return V2;
+ if (V2 == DFS.ZeroShadow)
+ return V1;
+ if (V1 == V2)
+ return V1;
+
+ auto V1Elems = ShadowElements.find(V1);
+ auto V2Elems = ShadowElements.find(V2);
+ if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
+ if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
+ V2Elems->second.begin(), V2Elems->second.end())) {
+ return V1;
+ } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
+ V1Elems->second.begin(), V1Elems->second.end())) {
+ return V2;
+ }
+ } else if (V1Elems != ShadowElements.end()) {
+ if (V1Elems->second.count(V2))
+ return V1;
+ } else if (V2Elems != ShadowElements.end()) {
+ if (V2Elems->second.count(V1))
+ return V2;
+ }
+
+ auto Key = std::make_pair(V1, V2);
+ if (V1 > V2)
+ std::swap(Key.first, Key.second);
+ CachedCombinedShadow &CCS = CachedCombinedShadows[Key];
+ if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
+ return CCS.Shadow;
+
+ IRBuilder<> IRB(Pos);
+ if (AvoidNewBlocks) {
+ CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2});
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ CCS.Block = Pos->getParent();
+ CCS.Shadow = Call;
+ } else {
+ BasicBlock *Head = Pos->getParent();
+ Value *Ne = IRB.CreateICmpNE(V1, V2);
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
+ IRBuilder<> ThenIRB(BI);
+ CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2});
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ BasicBlock *Tail = BI->getSuccessor(0);
+ PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
+ Phi->addIncoming(Call, Call->getParent());
+ Phi->addIncoming(V1, Head);
+
+ CCS.Block = Tail;
+ CCS.Shadow = Phi;
+ }
+
+ std::set<Value *> UnionElems;
+ if (V1Elems != ShadowElements.end()) {
+ UnionElems = V1Elems->second;
+ } else {
+ UnionElems.insert(V1);
+ }
+ if (V2Elems != ShadowElements.end()) {
+ UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
+ } else {
+ UnionElems.insert(V2);
+ }
+ ShadowElements[CCS.Shadow] = std::move(UnionElems);
+
+ return CCS.Shadow;
+}
+
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst. Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+ if (Inst->getNumOperands() == 0)
+ return DFS.ZeroShadow;
+
+ Value *Shadow = getShadow(Inst->getOperand(0));
+ for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+ Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+ }
+ return Shadow;
+}
+
+void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+ Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+ DFSF.setShadow(&I, CombinedShadow);
+}
+
+// Generates IR to load the shadow corresponding to bytes [Addr, Addr+Size),
+// where Addr has alignment Align, and takes the union of those shadows.
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+ AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
+ return IRB.CreateLoad(i->second);
+ }
+ }
+
+ uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ SmallVector<Value *, 2> Objs;
+ GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout());
+ bool AllConstants = true;
+ for (Value *Obj : Objs) {
+ if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
+ continue;
+ if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
+ continue;
+
+ AllConstants = false;
+ break;
+ }
+ if (AllConstants)
+ return DFS.ZeroShadow;
+
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ switch (Size) {
+ case 0:
+ return DFS.ZeroShadow;
+ case 1: {
+ LoadInst *LI = new LoadInst(ShadowAddr, "", Pos);
+ LI->setAlignment(ShadowAlign);
+ return LI;
+ }
+ case 2: {
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr1 = IRB.CreateGEP(DFS.ShadowTy, ShadowAddr,
+ ConstantInt::get(DFS.IntptrTy, 1));
+ return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
+ IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos);
+ }
+ }
+ if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) {
+ // Fast path for the common case where each byte has identical shadow: load
+ // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
+ // shadow is non-equal.
+ BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ IRBuilder<> FallbackIRB(FallbackBB);
+ CallInst *FallbackCall = FallbackIRB.CreateCall(
+ DFS.DFSanUnionLoadFn,
+ {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+
+ // Compare each of the shadows stored in the loaded 64 bits to each other,
+ // by computing (WideShadow rotl ShadowWidth) == WideShadow.
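+ // As a sketch (assuming ShadowWidth == 16): the wide load holds four shadows
+ // s0..s3, and (WideShadow rotl 16) == WideShadow exactly when
+ // s0 == s1 == s2 == s3, i.e. all four application bytes in this group carry
+ // the same label.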
+ IRBuilder<> IRB(Pos);
+ Value *WideAddr =
+ IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+ Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+ Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
+ Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
+ Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
+ Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+ Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+ BasicBlock *Head = Pos->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
+
+ if (DomTreeNode *OldNode = DT.getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
+ for (auto Child : Children)
+ DT.changeImmediateDominator(Child, NewNode);
+ }
+
+ // In the following code LastBr will refer to the previous basic block's
+ // conditional branch instruction, whose true successor is fixed up to point
+ // to the next block during the loop below or to the tail after the final
+ // iteration.
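+ // Illustrative shape of the final CFG: each group's block ends in
+ // "br %eq, <next group or Tail>, FallbackBB"; FallbackBB calls
+ // __dfsan_union_load and branches to Tail; Tail's phi merges TruncShadow
+ // (all groups compared equal) with the fallback call's result.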
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+ ReplaceInstWithInst(Head->getTerminator(), LastBr);
+ DT.addNewBlock(FallbackBB, Head);
+
+ for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
+ Ofs += 64 / DFS.ShadowWidth) {
+ BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ DT.addNewBlock(NextBB, LastBr->getParent());
+ IRBuilder<> NextIRB(NextBB);
+ WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+ ConstantInt::get(DFS.IntptrTy, 1));
+ Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+ ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+ LastBr->setSuccessor(0, NextBB);
+ LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+ }
+
+ LastBr->setSuccessor(0, Tail);
+ FallbackIRB.CreateBr(Tail);
+ PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
+ Shadow->addIncoming(FallbackCall, FallbackBB);
+ Shadow->addIncoming(TruncShadow, LastBr->getParent());
+ return Shadow;
+ }
+
+ IRBuilder<> IRB(Pos);
+ CallInst *FallbackCall = IRB.CreateCall(
+ DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ return FallbackCall;
+}
+
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+ auto &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ if (Size == 0) {
+ DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow);
+ return;
+ }
+
+ uint64_t Align;
+ if (ClPreserveAlignment) {
+ Align = LI.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(LI.getType());
+ } else {
+ Align = 1;
+ }
+ IRBuilder<> IRB(&LI);
+ Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
+ if (ClCombinePointerLabelsOnLoad) {
+ Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+ Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI);
+ }
+ if (Shadow != DFSF.DFS.ZeroShadow)
+ DFSF.NonZeroChecks.push_back(Shadow);
+
+ DFSF.setShadow(&LI, Shadow);
+}
+
+void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Value *Shadow, Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+ AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
+ IRB.CreateStore(Shadow, i->second);
+ return;
+ }
+ }
+
+ uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ if (Shadow == DFS.ZeroShadow) {
+ IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
+ Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+ Value *ExtShadowAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+ IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+ return;
+ }
+
+ const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
+ uint64_t Offset = 0;
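+ // Worked example (assuming ShadowWidth == 16, so ShadowVecSize == 8): a
+ // 20-byte store emits two <8 x i16> shadow-vector stores covering bytes
+ // 0-15, then four scalar shadow stores for bytes 16-19.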
+ if (Size >= ShadowVecSize) {
+ VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+ Value *ShadowVec = UndefValue::get(ShadowVecTy);
+ for (unsigned i = 0; i != ShadowVecSize; ++i) {
+ ShadowVec = IRB.CreateInsertElement(
+ ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
+ }
+ Value *ShadowVecAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+ do {
+ Value *CurShadowVecAddr =
+ IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
+ IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+ Size -= ShadowVecSize;
+ ++Offset;
+ } while (Size >= ShadowVecSize);
+ Offset *= ShadowVecSize;
+ }
+ while (Size > 0) {
+ Value *CurShadowAddr =
+ IRB.CreateConstGEP1_32(DFS.ShadowTy, ShadowAddr, Offset);
+ IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign);
+ --Size;
+ ++Offset;
+ }
+}
+
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+ auto &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
+ if (Size == 0)
+ return;
+
+ uint64_t Align;
+ if (ClPreserveAlignment) {
+ Align = SI.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(SI.getValueOperand()->getType());
+ } else {
+ Align = 1;
+ }
+
+ Value* Shadow = DFSF.getShadow(SI.getValueOperand());
+ if (ClCombinePointerLabelsOnStore) {
+ Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
+ Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
+ }
+ DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+ bool AllLoadsStores = true;
+ for (User *U : I.users()) {
+ if (isa<LoadInst>(U))
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getPointerOperand() == &I)
+ continue;
+ }
+
+ AllLoadsStores = false;
+ break;
+ }
+ if (AllLoadsStores) {
+ IRBuilder<> IRB(&I);
+ DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy);
+ }
+ DFSF.setShadow(&I, DFSF.DFS.ZeroShadow);
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+ Value *CondShadow = DFSF.getShadow(I.getCondition());
+ Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+ Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+
+ if (isa<VectorType>(I.getCondition()->getType())) {
+ DFSF.setShadow(
+ &I,
+ DFSF.combineShadows(
+ CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I));
+ } else {
+ Value *ShadowSel;
+ if (TrueShadow == FalseShadow) {
+ ShadowSel = TrueShadow;
+ } else {
+ ShadowSel =
+ SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+ }
+ DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I));
+ }
+}
+
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *ValShadow = DFSF.getShadow(I.getValue());
+ IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
+ {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
+ *DFSF.DFS.Ctx)),
+ IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+ Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+ Value *LenShadow = IRB.CreateMul(
+ I.getLength(),
+ ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
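+ // (Illustratively, with 16-bit shadows a 10-byte memcpy transfers 20 bytes
+ // of shadow, since each application byte has ShadowWidth / 8 shadow bytes.)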
+ Value *AlignShadow;
+ if (ClPreserveAlignment) {
+ AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
+ ConstantInt::get(I.getAlignmentCst()->getType(),
+ DFSF.DFS.ShadowWidth / 8));
+ } else {
+ AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
+ DFSF.DFS.ShadowWidth / 8);
+ }
+ Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+ DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
+ SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+ IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow,
+ AlignShadow, I.getVolatileCst()});
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+ if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+ switch (DFSF.IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *S = DFSF.getShadow(RI.getReturnValue());
+ IRBuilder<> IRB(&RI);
+ IRB.CreateStore(S, DFSF.getRetvalTLS());
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ IRBuilder<> IRB(&RI);
+ Type *RT = DFSF.F->getFunctionType()->getReturnType();
+ Value *InsVal =
+ IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+ Value *InsShadow =
+ IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+ RI.setOperand(0, InsShadow);
+ break;
+ }
+ }
+ }
+}
+
+void DFSanVisitor::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
+ visitOperandShadowInst(*CS.getInstruction());
+ return;
+ }
+
+ // Calls to this function are synthesized in wrappers, and we shouldn't
+ // instrument them.
+ if (F == DFSF.DFS.DFSanVarargWrapperFn)
+ return;
+
+ IRBuilder<> IRB(CS.getInstruction());
+
+ DenseMap<Value *, Function *>::iterator i =
+ DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
+ if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+ Function *F = i->second;
+ switch (DFSF.DFS.getWrapperKind(F)) {
+ case DataFlowSanitizer::WK_Warning: {
+ CS.setCalledFunction(F);
+ IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+ IRB.CreateGlobalStringPtr(F->getName()));
+ DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ return;
+ }
+ case DataFlowSanitizer::WK_Discard: {
+ CS.setCalledFunction(F);
+ DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ return;
+ }
+ case DataFlowSanitizer::WK_Functional: {
+ CS.setCalledFunction(F);
+ visitOperandShadowInst(*CS.getInstruction());
+ return;
+ }
+ case DataFlowSanitizer::WK_Custom: {
+ // Don't try to handle invokes of custom functions, it's too complicated.
+ // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+ // wrapper.
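+ // Illustrative sketch (names hypothetical, assuming 16-bit labels): a call
+ // to int f(int x) becomes
+ //   %r = call i32 @__dfsw_f(i32 %x, i16 %x_label, i16* %labelreturn)
+ // and f's return shadow is loaded back from %labelreturn below.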
+ if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+ FunctionType *FT = F->getFunctionType();
+ FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+ std::string CustomFName = "__dfsw_";
+ CustomFName += F->getName();
+ Constant *CustomF =
+ DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+ if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
+ CustomFn->copyAttributesFrom(F);
+
+ // Custom functions returning non-void will write to the return label.
+ if (!FT->getReturnType()->isVoidTy()) {
+ CustomFn->removeAttributes(AttributeList::FunctionIndex,
+ DFSF.DFS.ReadOnlyNoneAttrs);
+ }
+ }
+
+ std::vector<Value *> Args;
+
+ CallSite::arg_iterator i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+ Type *T = (*i)->getType();
+ FunctionType *ParamFT;
+ if (isa<PointerType>(T) &&
+ (ParamFT = dyn_cast<FunctionType>(
+ cast<PointerType>(T)->getElementType()))) {
+ std::string TName = "dfst";
+ TName += utostr(FT->getNumParams() - n);
+ TName += "$";
+ TName += F->getName();
+ Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+ Args.push_back(T);
+ Args.push_back(
+ IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+ } else {
+ Args.push_back(*i);
+ }
+ }
+
+ i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (FT->isVarArg()) {
+ auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
+ CS.arg_size() - FT->getNumParams());
+ auto *LabelVAAlloca = new AllocaInst(
+ LabelVATy, getDataLayout().getAllocaAddrSpace(),
+ "labelva", &DFSF.F->getEntryBlock().front());
+
+ for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
+ auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
+ IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr);
+ }
+
+ Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
+ }
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ if (!DFSF.LabelReturnAlloca) {
+ DFSF.LabelReturnAlloca =
+ new AllocaInst(DFSF.DFS.ShadowTy,
+ getDataLayout().getAllocaAddrSpace(),
+ "labelreturn", &DFSF.F->getEntryBlock().front());
+ }
+ Args.push_back(DFSF.LabelReturnAlloca);
+ }
+
+ for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i)
+ Args.push_back(*i);
+
+ CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+ CustomCI->setCallingConv(CI->getCallingConv());
+ CustomCI->setAttributes(CI->getAttributes());
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
+ DFSF.setShadow(CustomCI, LabelLoad);
+ }
+
+ CI->replaceAllUsesWith(CustomCI);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ }
+ }
+
+ FunctionType *FT = cast<FunctionType>(
+ CS.getCalledValue()->getType()->getPointerElementType());
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
+ IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
+ DFSF.getArgTLS(i, CS.getInstruction()));
+ }
+ }
+
+ Instruction *Next = nullptr;
+ if (!CS.getType()->isVoidTy()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ if (II->getNormalDest()->getSinglePredecessor()) {
+ Next = &II->getNormalDest()->front();
+ } else {
+ BasicBlock *NewBB =
+ SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
+ Next = &NewBB->front();
+ }
+ } else {
+ assert(CS->getIterator() != CS->getParent()->end());
+ Next = CS->getNextNode();
+ }
+
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ IRBuilder<> NextIRB(Next);
+ LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
+ DFSF.SkipInsts.insert(LI);
+ DFSF.setShadow(CS.getInstruction(), LI);
+ DFSF.NonZeroChecks.push_back(LI);
+ }
+ }
+
+ // Do all instrumentation for IA_Args down here to defer tampering with the
+ // CFG in a way that SplitEdge may be able to detect.
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+ FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+ Value *Func =
+ IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+ std::vector<Value *> Args;
+
+ CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(*i);
+
+ i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (FT->isVarArg()) {
+ unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+ ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
+ AllocaInst *VarArgShadow =
+ new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+ "", &DFSF.F->getEntryBlock().front());
+ Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
+ for (unsigned n = 0; i != e; ++i, ++n) {
+ IRB.CreateStore(
+ DFSF.getShadow(*i),
+ IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
+ Args.push_back(*i);
+ }
+ }
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
+ Args);
+ } else {
+ NewCS = IRB.CreateCall(Func, Args);
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(CS.getAttributes().removeAttributes(
+ *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
+
+ if (Next) {
+ ExtractValueInst *ExVal =
+ ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+ DFSF.SkipInsts.insert(ExVal);
+ ExtractValueInst *ExShadow =
+ ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+ DFSF.SkipInsts.insert(ExShadow);
+ DFSF.setShadow(ExVal, ExShadow);
+ DFSF.NonZeroChecks.push_back(ExShadow);
+
+ CS.getInstruction()->replaceAllUsesWith(ExVal);
+ }
+
+ CS.getInstruction()->eraseFromParent();
+ }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+ PHINode *ShadowPN =
+ PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+ // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+ Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+ for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+ ++i) {
+ ShadowPN->addIncoming(UndefShadow, *i);
+ }
+
+ DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+ DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
new file mode 100644
index 000000000000..6864d295525c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -0,0 +1,915 @@
+//===-- EfficiencySanitizer.cpp - performance tuner -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners
+// that detects multiple performance issues via separate sub-tools.
+//
+// The instrumentation phase is straightforward:
+// - Take action on every memory access: either inlined instrumentation,
+// or inserted calls to our run-time library.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Turn mem{set,cpy,move} intrinsics into library calls.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "esan"
+
+// The tool type must be just one of these ClTool* options, as the tools
+// cannot be combined due to shadow memory constraints.
+static cl::opt<bool>
+ ClToolCacheFrag("esan-cache-frag", cl::init(false),
+ cl::desc("Detect data cache fragmentation"), cl::Hidden);
+static cl::opt<bool>
+ ClToolWorkingSet("esan-working-set", cl::init(false),
+ cl::desc("Measure the working set size"), cl::Hidden);
+// Each new tool will get its own opt flag here.
+// These are converted to EfficiencySanitizerOptions for use
+// in the code.
+
+static cl::opt<bool> ClInstrumentLoadsAndStores(
+ "esan-instrument-loads-and-stores", cl::init(true),
+ cl::desc("Instrument loads and stores"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+ "esan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+static cl::opt<bool> ClInstrumentFastpath(
+ "esan-instrument-fastpath", cl::init(true),
+ cl::desc("Instrument fastpath"), cl::Hidden);
+static cl::opt<bool> ClAuxFieldInfo(
+ "esan-aux-field-info", cl::init(true),
+ cl::desc("Generate binary with auxiliary struct field information"),
+ cl::Hidden);
+
+// Experiments show that the performance difference can be 2x or more,
+// and accuracy loss is typically negligible, so we turn this on by default.
+static cl::opt<bool> ClAssumeIntraCacheLine(
+ "esan-assume-intra-cache-line", cl::init(true),
+ cl::desc("Assume each memory access touches just one cache line, for "
+ "better performance but with a potential loss of accuracy."),
+ cl::Hidden);
+
+STATISTIC(NumInstrumentedLoads, "Number of instrumented loads");
+STATISTIC(NumInstrumentedStores, "Number of instrumented stores");
+STATISTIC(NumFastpaths, "Number of instrumented fastpaths");
+STATISTIC(NumAccessesWithIrregularSize,
+ "Number of accesses with a size outside our targeted callout sizes");
+STATISTIC(NumIgnoredStructs, "Number of ignored structs");
+STATISTIC(NumIgnoredGEPs, "Number of ignored GEP instructions");
+STATISTIC(NumInstrumentedGEPs, "Number of instrumented GEP instructions");
+STATISTIC(NumAssumedIntraCacheLine,
+ "Number of accesses assumed to be intra-cache-line");
+
+static const uint64_t EsanCtorAndDtorPriority = 0;
+static const char *const EsanModuleCtorName = "esan.module_ctor";
+static const char *const EsanModuleDtorName = "esan.module_dtor";
+static const char *const EsanInitName = "__esan_init";
+static const char *const EsanExitName = "__esan_exit";
+
+// We need to specify the tool to the runtime earlier than
+// the ctor is called in some cases, so we set a global variable.
+static const char *const EsanWhichToolName = "__esan_which_tool";
+
+// We must keep these Shadow* constants consistent with the esan runtime.
+// FIXME: Try to place these shadow constants, the names of the __esan_*
+// interface functions, and the ToolType enum into a header shared between
+// llvm and compiler-rt.
+struct ShadowMemoryParams {
+ uint64_t ShadowMask;
+ uint64_t ShadowOffs[3];
+};
+
+static const ShadowMemoryParams ShadowParams47 = {
+ 0x00000fffffffffffull,
+ {
+ 0x0000130000000000ull, 0x0000220000000000ull, 0x0000440000000000ull,
+ }};
+
+static const ShadowMemoryParams ShadowParams40 = {
+ 0x0fffffffffull,
+ {
+ 0x1300000000ull, 0x2200000000ull, 0x4400000000ull,
+ }};
+
+// This array is indexed by the ToolType enum.
+static const int ShadowScale[] = {
+ 0, // ESAN_None.
+ 2, // ESAN_CacheFrag: 4B:1B, so 4 to 1 == >>2.
+ 6, // ESAN_WorkingSet: 64B:1B, so 64 to 1 == >>6.
+};
+
+// MaxStructCounterNameSize is a soft size limit to avoid insanely long
+// names for those extremely large structs.
+static const unsigned MaxStructCounterNameSize = 512;
+
+namespace {
+
+static EfficiencySanitizerOptions
+OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
+ if (ClToolCacheFrag)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+ else if (ClToolWorkingSet)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_WorkingSet;
+
+ // Direct opt invocation with no params will have the default ESAN_None.
+ // We run the default tool in that case.
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_None)
+ Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
+
+ return Options;
+}
+
+// Create a constant for Str so that we can pass it to the run-time lib.
+static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ // We use private linkage for module-local strings. If they can be merged
+ // with another one, we set the unnamed_addr attribute.
+ GlobalVariable *GV =
+ new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst, "");
+ if (AllowMerging)
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
+ return GV;
+}
+
+/// EfficiencySanitizer: instrument each module to find performance issues.
+class EfficiencySanitizer : public ModulePass {
+public:
+ EfficiencySanitizer(
+ const EfficiencySanitizerOptions &Opts = EfficiencySanitizerOptions())
+ : ModulePass(ID), Options(OverrideOptionsFromCL(Opts)) {}
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnModule(Module &M) override;
+ static char ID;
+
+private:
+ bool initOnModule(Module &M);
+ void initializeCallbacks(Module &M);
+ bool shouldIgnoreStructType(StructType *StructTy);
+ void createStructCounterName(
+ StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr);
+ void createCacheFragAuxGV(
+ Module &M, const DataLayout &DL, StructType *StructTy,
+ GlobalVariable *&TypeNames, GlobalVariable *&Offsets, GlobalVariable *&Size);
+ GlobalVariable *createCacheFragInfoGV(Module &M, const DataLayout &DL,
+ Constant *UnitName);
+ Constant *createEsanInitToolInfoArg(Module &M, const DataLayout &DL);
+ void createDestructor(Module &M, Constant *ToolInfoArg);
+ bool runOnFunction(Function &F, Module &M);
+ bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL);
+ bool instrumentMemIntrinsic(MemIntrinsic *MI);
+ bool instrumentGetElementPtr(Instruction *I, Module &M);
+ bool insertCounterUpdate(Instruction *I, StructType *StructTy,
+ unsigned CounterIdx);
+ unsigned getFieldCounterIdx(StructType *StructTy) {
+ return 0;
+ }
+ unsigned getArrayCounterIdx(StructType *StructTy) {
+ return StructTy->getNumElements();
+ }
+ unsigned getStructCounterSize(StructType *StructTy) {
+ // The struct counter array includes:
+ // - one counter for each struct field,
+ // - one counter for the struct access within an array.
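+ // For a 3-field struct, for example, counters 0-2 track the fields and
+ // counter 3 tracks whole-struct accesses through an array, so the size is 4.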
+ return (StructTy->getNumElements()/*field*/ + 1/*array*/);
+ }
+ bool shouldIgnoreMemoryAccess(Instruction *I);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ Value *appToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool instrumentFastpath(Instruction *I, const DataLayout &DL, bool IsStore,
+ Value *Addr, unsigned Alignment);
+ // Each tool has its own fastpath routine:
+ bool instrumentFastpathCacheFrag(Instruction *I, const DataLayout &DL,
+ Value *Addr, unsigned Alignment);
+ bool instrumentFastpathWorkingSet(Instruction *I, const DataLayout &DL,
+ Value *Addr, unsigned Alignment);
+
+ EfficiencySanitizerOptions Options;
+ LLVMContext *Ctx;
+ Type *IntptrTy;
+ // Our slowpath involves callouts to the runtime library.
+ // Access sizes are powers of two: 1, 2, 4, 8, 16.
+ static const size_t NumberOfAccessSizes = 5;
+ Function *EsanAlignedLoad[NumberOfAccessSizes];
+ Function *EsanAlignedStore[NumberOfAccessSizes];
+ Function *EsanUnalignedLoad[NumberOfAccessSizes];
+ Function *EsanUnalignedStore[NumberOfAccessSizes];
+ // For irregular sizes of any alignment:
+ Function *EsanUnalignedLoadN, *EsanUnalignedStoreN;
+ Function *MemmoveFn, *MemcpyFn, *MemsetFn;
+ Function *EsanCtorFunction;
+ Function *EsanDtorFunction;
+ // Remember the counter variable for each struct type to avoid
+ // recomputing the variable name later during instrumentation.
+ std::map<Type *, GlobalVariable *> StructTyMap;
+ ShadowMemoryParams ShadowParams;
+};
+} // namespace
+
+char EfficiencySanitizer::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ EfficiencySanitizer, "esan",
+ "EfficiencySanitizer: finds performance issues.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ EfficiencySanitizer, "esan",
+ "EfficiencySanitizer: finds performance issues.", false, false)
+
+StringRef EfficiencySanitizer::getPassName() const {
+ return "EfficiencySanitizer";
+}
+
+void EfficiencySanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+ModulePass *
+llvm::createEfficiencySanitizerPass(const EfficiencySanitizerOptions &Options) {
+ return new EfficiencySanitizer(Options);
+}
+
+void EfficiencySanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ // Initialize the callbacks.
+ for (size_t Idx = 0; Idx < NumberOfAccessSizes; ++Idx) {
+ const unsigned ByteSize = 1U << Idx;
+ std::string ByteSizeStr = utostr(ByteSize);
+ // We inline load + store instrumentation for the most common cases
+ // (aligned accesses of frequent sizes); these callouts are for the slowpath.
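+ // For example, Idx 0..4 below yields __esan_aligned_load1/2/4/8/16 and the
+ // matching store and unaligned variants.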
+ SmallString<32> AlignedLoadName("__esan_aligned_load" + ByteSizeStr);
+ EsanAlignedLoad[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ SmallString<32> AlignedStoreName("__esan_aligned_store" + ByteSizeStr);
+ EsanAlignedStore[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ SmallString<32> UnalignedLoadName("__esan_unaligned_load" + ByteSizeStr);
+ EsanUnalignedLoad[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ SmallString<32> UnalignedStoreName("__esan_unaligned_store" + ByteSizeStr);
+ EsanUnalignedStore[Idx] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ }
+ EsanUnalignedLoadN = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__esan_unaligned_loadN", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy));
+ EsanUnalignedStoreN = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__esan_unaligned_storeN", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy));
+ MemmoveFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy));
+ MemcpyFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy));
+ MemsetFn = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy));
+}
+
+bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
+ if (StructTy == nullptr || StructTy->isOpaque() /* no struct body */)
+ return true;
+ return false;
+}
+
+void EfficiencySanitizer::createStructCounterName(
+ StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr) {
+ // Append NumFields and field type ids to avoid struct conflicts
+ // with the same name but different fields.
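+ // E.g. (hypothetical name), a struct named struct.Foo with two fields is
+ // encoded as "struct.Foo$2$<tid1>$<tid0>", where <tidN> is the LLVM TypeID
+ // of field N, appended in reverse field order below.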
+ if (StructTy->hasName())
+ NameStr += StructTy->getName();
+ else
+ NameStr += "struct.anon";
+ // The actual StructCounterName may grow past MaxStructCounterNameSize:
+ // $NumFields and at least one field type id are always appended before
+ // the limit is enforced.
+ // Append $NumFields.
+ NameStr += "$";
+ Twine(StructTy->getNumElements()).toVector(NameStr);
+ // Append struct field type ids in the reverse order.
+ for (int i = StructTy->getNumElements() - 1; i >= 0; --i) {
+ NameStr += "$";
+ Twine(StructTy->getElementType(i)->getTypeID()).toVector(NameStr);
+ if (NameStr.size() >= MaxStructCounterNameSize)
+ break;
+ }
+ if (StructTy->isLiteral()) {
+ // End with $ for literal struct.
+ NameStr += "$";
+ }
+}
+
+// Create global variables with auxiliary information (e.g., struct field size,
+// offset, and type name) for better user report.
+void EfficiencySanitizer::createCacheFragAuxGV(
+ Module &M, const DataLayout &DL, StructType *StructTy,
+ GlobalVariable *&TypeName, GlobalVariable *&Offset,
+ GlobalVariable *&Size) {
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ auto *Int32Ty = Type::getInt32Ty(*Ctx);
+ // FieldTypeName.
+ auto *TypeNameArrayTy = ArrayType::get(Int8PtrTy, StructTy->getNumElements());
+ TypeName = new GlobalVariable(M, TypeNameArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> TypeNameVec;
+ // FieldOffset.
+ auto *OffsetArrayTy = ArrayType::get(Int32Ty, StructTy->getNumElements());
+ Offset = new GlobalVariable(M, OffsetArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> OffsetVec;
+ // FieldSize
+ auto *SizeArrayTy = ArrayType::get(Int32Ty, StructTy->getNumElements());
+ Size = new GlobalVariable(M, SizeArrayTy, true,
+ GlobalVariable::InternalLinkage, nullptr);
+ SmallVector<Constant *, 16> SizeVec;
+ for (unsigned i = 0; i < StructTy->getNumElements(); ++i) {
+ Type *Ty = StructTy->getElementType(i);
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ Ty->print(StrOS);
+ TypeNameVec.push_back(
+ ConstantExpr::getPointerCast(
+ createPrivateGlobalForString(M, StrOS.str(), true),
+ Int8PtrTy));
+ OffsetVec.push_back(
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getElementOffset(i)));
+ SizeVec.push_back(ConstantInt::get(Int32Ty,
+ DL.getTypeAllocSize(Ty)));
+ }
+ TypeName->setInitializer(ConstantArray::get(TypeNameArrayTy, TypeNameVec));
+ Offset->setInitializer(ConstantArray::get(OffsetArrayTy, OffsetVec));
+ Size->setInitializer(ConstantArray::get(SizeArrayTy, SizeVec));
+}
+
+// Create the global variable for the cache-fragmentation tool.
+GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
+ Module &M, const DataLayout &DL, Constant *UnitName) {
+ assert(Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag);
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ auto *Int8PtrPtrTy = Int8PtrTy->getPointerTo();
+ auto *Int32Ty = Type::getInt32Ty(*Ctx);
+ auto *Int32PtrTy = Type::getInt32PtrTy(*Ctx);
+ auto *Int64Ty = Type::getInt64Ty(*Ctx);
+ auto *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+ // This structure should be kept consistent with the StructInfo struct
+ // in the runtime library.
+ // struct StructInfo {
+ // const char *StructName;
+ // u32 Size;
+ // u32 NumFields;
+ // u32 *FieldOffset; // auxiliary struct field info.
+ // u32 *FieldSize; // auxiliary struct field info.
+ // const char **FieldTypeName; // auxiliary struct field info.
+ // u64 *FieldCounters;
+ // u64 *ArrayCounter;
+ // };
+ auto *StructInfoTy =
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
+ auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
+ // This structure should be kept consistent with the CacheFragInfo struct
+ // in the runtime library.
+ // struct CacheFragInfo {
+ // const char *UnitName;
+ // u32 NumStructs;
+ // StructInfo *Structs;
+ // };
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
+
+ std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
+ unsigned NumStructs = 0;
+ SmallVector<Constant *, 16> Initializers;
+
+ for (auto &StructTy : Vec) {
+ if (shouldIgnoreStructType(StructTy)) {
+ ++NumIgnoredStructs;
+ continue;
+ }
+ ++NumStructs;
+
+ // StructName.
+ SmallString<MaxStructCounterNameSize> CounterNameStr;
+ createStructCounterName(StructTy, CounterNameStr);
+ GlobalVariable *StructCounterName = createPrivateGlobalForString(
+ M, CounterNameStr, /*AllowMerging*/true);
+
+ // Counters.
+ // We create the counter array with StructCounterName and weak linkage
+ // so that the structs with the same name and layout from different
+ // compilation units will be merged into one.
+ auto *CounterArrayTy = ArrayType::get(Int64Ty,
+ getStructCounterSize(StructTy));
+ GlobalVariable *Counters =
+ new GlobalVariable(M, CounterArrayTy, false,
+ GlobalVariable::WeakAnyLinkage,
+ ConstantAggregateZero::get(CounterArrayTy),
+ CounterNameStr);
+
+ // Remember the counter variable for each struct type.
+ StructTyMap.insert(std::pair<Type *, GlobalVariable *>(StructTy, Counters));
+
+ // We pass the field type name array, offset array, and size array to
+ // the runtime for better reporting.
+ GlobalVariable *TypeName = nullptr, *Offset = nullptr, *Size = nullptr;
+ if (ClAuxFieldInfo)
+ createCacheFragAuxGV(M, DL, StructTy, TypeName, Offset, Size);
+
+ Constant *FieldCounterIdx[2];
+ FieldCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
+ FieldCounterIdx[1] = ConstantInt::get(Int32Ty,
+ getFieldCounterIdx(StructTy));
+ Constant *ArrayCounterIdx[2];
+ ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
+ ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
+ getArrayCounterIdx(StructTy));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
+ }
+ // Structs.
+ Constant *StructInfo;
+ if (NumStructs == 0) {
+ StructInfo = ConstantPointerNull::get(StructInfoPtrTy);
+ } else {
+ auto *StructInfoArrayTy = ArrayType::get(StructInfoTy, NumStructs);
+ StructInfo = ConstantExpr::getPointerCast(
+ new GlobalVariable(M, StructInfoArrayTy, false,
+ GlobalVariable::InternalLinkage,
+ ConstantArray::get(StructInfoArrayTy, Initializers)),
+ StructInfoPtrTy);
+ }
+
+ auto *CacheFragInfoGV = new GlobalVariable(
+ M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
+ return CacheFragInfoGV;
+}
+
+// Create the tool-specific argument passed to EsanInit and EsanExit.
+Constant *EfficiencySanitizer::createEsanInitToolInfoArg(Module &M,
+ const DataLayout &DL) {
+ // This structure contains tool-specific information about each compilation
+ // unit (module) and is passed to the runtime library.
+ GlobalVariable *ToolInfoGV = nullptr;
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ // Compilation unit name.
+ auto *UnitName = ConstantExpr::getPointerCast(
+ createPrivateGlobalForString(M, M.getModuleIdentifier(), true),
+ Int8PtrTy);
+
+ // Create the tool-specific variable.
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag)
+ ToolInfoGV = createCacheFragInfoGV(M, DL, UnitName);
+
+ if (ToolInfoGV != nullptr)
+ return ConstantExpr::getPointerCast(ToolInfoGV, Int8PtrTy);
+
+ // Create the null pointer if no tool-specific variable created.
+ return ConstantPointerNull::get(Int8PtrTy);
+}
+
+void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ EsanDtorFunction = Function::Create(FunctionType::get(Type::getVoidTy(*Ctx),
+ false),
+ GlobalValue::InternalLinkage,
+ EsanModuleDtorName, &M);
+ ReturnInst::Create(*Ctx, BasicBlock::Create(*Ctx, "", EsanDtorFunction));
+ IRBuilder<> IRB_Dtor(EsanDtorFunction->getEntryBlock().getTerminator());
+ Function *EsanExit = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(EsanExitName, IRB_Dtor.getVoidTy(),
+ Int8PtrTy));
+ EsanExit->setLinkage(Function::ExternalLinkage);
+ IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});
+ appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
+}
+
+bool EfficiencySanitizer::initOnModule(Module &M) {
+
+ Triple TargetTriple(M.getTargetTriple());
+ if (TargetTriple.getArch() == Triple::mips64 || TargetTriple.getArch() == Triple::mips64el)
+ ShadowParams = ShadowParams40;
+ else
+ ShadowParams = ShadowParams47;
+
+ Ctx = &M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ IRBuilder<> IRB(M.getContext());
+ IntegerType *OrdTy = IRB.getInt32Ty();
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
+ IntptrTy = DL.getIntPtrType(M.getContext());
+ // Create the variable passed to EsanInit and EsanExit.
+ Constant *ToolInfoArg = createEsanInitToolInfoArg(M, DL);
+ // Constructor
+ // We specify the tool type both in the EsanWhichToolName global
+ // and as an arg to the init routine as a sanity check.
+ std::tie(EsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, EsanModuleCtorName, EsanInitName, /*InitArgTypes=*/{OrdTy, Int8PtrTy},
+ /*InitArgs=*/{
+ ConstantInt::get(OrdTy, static_cast<int>(Options.ToolType)),
+ ToolInfoArg});
+ appendToGlobalCtors(M, EsanCtorFunction, EsanCtorAndDtorPriority);
+
+ createDestructor(M, ToolInfoArg);
+
+ new GlobalVariable(M, OrdTy, true,
+ GlobalValue::WeakAnyLinkage,
+ ConstantInt::get(OrdTy,
+ static_cast<int>(Options.ToolType)),
+ EsanWhichToolName);
+
+ return true;
+}
+
+Value *EfficiencySanitizer::appToShadow(Value *Shadow, IRBuilder<> &IRB) {
+ // Shadow = ((App & Mask) + Offs) >> Scale
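+ // For instance (illustrative only), the cache-frag tool on the ShadowParams47
+ // layout uses Scale == 2, so Offs == ShadowOffs[2] == 0x0000440000000000 and
+ // Shadow = ((App & 0x00000fffffffffff) + 0x0000440000000000) >> 2.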
+ Shadow = IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowParams.ShadowMask));
+ uint64_t Offs;
+ int Scale = ShadowScale[Options.ToolType];
+ if (Scale <= 2)
+ Offs = ShadowParams.ShadowOffs[Scale];
+ else
+ Offs = ShadowParams.ShadowOffs[0] << Scale;
+ Shadow = IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Offs));
+ if (Scale > 0)
+ Shadow = IRB.CreateLShr(Shadow, Scale);
+ return Shadow;
+}
+
+bool EfficiencySanitizer::shouldIgnoreMemoryAccess(Instruction *I) {
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ // We'd like to know about cache fragmentation in vtable accesses and
+ // constant data references, so we do not currently ignore anything.
+ return false;
+ } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+ // TODO: the instrumentation disturbs the data layout on the stack, so we
+ // may want to add an option to ignore stack references (if we can
+ // distinguish them) to reduce overhead.
+ }
+ // TODO(bruening): future tools will be returning true for some cases.
+ return false;
+}
+
+bool EfficiencySanitizer::runOnModule(Module &M) {
+ bool Res = initOnModule(M);
+ initializeCallbacks(M);
+ for (auto &F : M) {
+ Res |= runOnFunction(F, M);
+ }
+ return Res;
+}
+
+bool EfficiencySanitizer::runOnFunction(Function &F, Module &M) {
+ // This is required to prevent instrumenting the call to __esan_init from
+ // within the module constructor.
+ if (&F == EsanCtorFunction)
+ return false;
+ SmallVector<Instruction *, 8> LoadsAndStores;
+ SmallVector<Instruction *, 8> MemIntrinCalls;
+ SmallVector<Instruction *, 8> GetElementPtrs;
+ bool Res = false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if ((isa<LoadInst>(Inst) || isa<StoreInst>(Inst) ||
+ isa<AtomicRMWInst>(Inst) || isa<AtomicCmpXchgInst>(Inst)) &&
+ !shouldIgnoreMemoryAccess(&Inst))
+ LoadsAndStores.push_back(&Inst);
+ else if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
+ else if (isa<GetElementPtrInst>(Inst))
+ GetElementPtrs.push_back(&Inst);
+ else if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ }
+ }
+
+ if (ClInstrumentLoadsAndStores) {
+ for (auto Inst : LoadsAndStores) {
+ Res |= instrumentLoadOrStore(Inst, DL);
+ }
+ }
+
+ if (ClInstrumentMemIntrinsics) {
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
+ }
+
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ for (auto Inst : GetElementPtrs) {
+ Res |= instrumentGetElementPtr(Inst, M);
+ }
+ }
+
+ return Res;
+}
+
+bool EfficiencySanitizer::instrumentLoadOrStore(Instruction *I,
+ const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ bool IsStore;
+ Value *Addr;
+ unsigned Alignment;
+ if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+ IsStore = false;
+ Alignment = Load->getAlignment();
+ Addr = Load->getPointerOperand();
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+ IsStore = true;
+ Alignment = Store->getAlignment();
+ Addr = Store->getPointerOperand();
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ IsStore = true;
+ Alignment = 0;
+ Addr = RMW->getPointerOperand();
+ } else if (AtomicCmpXchgInst *Xchg = dyn_cast<AtomicCmpXchgInst>(I)) {
+ IsStore = true;
+ Alignment = 0;
+ Addr = Xchg->getPointerOperand();
+ } else
+ llvm_unreachable("Unsupported mem access type");
+
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSizeBytes = DL.getTypeStoreSizeInBits(OrigTy) / 8;
+ Value *OnAccessFunc = nullptr;
+
+ // Convert 0 to the default alignment.
+ if (Alignment == 0)
+ Alignment = DL.getPrefTypeAlignment(OrigTy);
+
+ if (IsStore)
+ NumInstrumentedStores++;
+ else
+ NumInstrumentedLoads++;
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0) {
+ OnAccessFunc = IsStore ? EsanUnalignedStoreN : EsanUnalignedLoadN;
+ IRB.CreateCall(OnAccessFunc,
+ {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ ConstantInt::get(IntptrTy, TypeSizeBytes)});
+ } else {
+ if (ClInstrumentFastpath &&
+ instrumentFastpath(I, DL, IsStore, Addr, Alignment)) {
+ NumFastpaths++;
+ return true;
+ }
+ if (Alignment == 0 || (Alignment % TypeSizeBytes) == 0)
+ OnAccessFunc = IsStore ? EsanAlignedStore[Idx] : EsanAlignedLoad[Idx];
+ else
+ OnAccessFunc = IsStore ? EsanUnalignedStore[Idx] : EsanUnalignedLoad[Idx];
+ IRB.CreateCall(OnAccessFunc,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ }
+ return true;
+}
+
+// It's simplest to replace the memset/memmove/memcpy intrinsics with
+// calls that the runtime library intercepts.
+// Our pass is late enough that calls should not turn back into intrinsics.
+bool EfficiencySanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ bool Res = false;
+ if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ MemsetFn,
+ {IRB.CreatePointerCast(MI->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getArgOperand(2), IntptrTy, false)});
+ MI->eraseFromParent();
+ Res = true;
+ } else if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemCpyInst>(MI) ? MemcpyFn : MemmoveFn,
+ {IRB.CreatePointerCast(MI->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getArgOperand(2), IntptrTy, false)});
+ MI->eraseFromParent();
+ Res = true;
+ } else
+ llvm_unreachable("Unsupported mem intrinsic type");
+ return Res;
+}
+
+bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
+ GetElementPtrInst *GepInst = dyn_cast<GetElementPtrInst>(I);
+ bool Res = false;
+ if (GepInst == nullptr || GepInst->getNumIndices() == 1) {
+ ++NumIgnoredGEPs;
+ return false;
+ }
+ Type *SourceTy = GepInst->getSourceElementType();
+ StructType *StructTy = nullptr;
+ ConstantInt *Idx;
+ // Check if GEP calculates address from a struct array.
+ if (isa<StructType>(SourceTy)) {
+ StructTy = cast<StructType>(SourceTy);
+ Idx = dyn_cast<ConstantInt>(GepInst->getOperand(1));
+ if ((Idx == nullptr || Idx->getSExtValue() != 0) &&
+ !shouldIgnoreStructType(StructTy) && StructTyMap.count(StructTy) != 0)
+ Res |= insertCounterUpdate(I, StructTy, getArrayCounterIdx(StructTy));
+ }
+ // Iterate all (except the first and the last) idx within each GEP instruction
+ // for possible nested struct field address calculation.
+ for (unsigned i = 1; i < GepInst->getNumIndices(); ++i) {
+ SmallVector<Value *, 8> IdxVec(GepInst->idx_begin(),
+ GepInst->idx_begin() + i);
+ Type *Ty = GetElementPtrInst::getIndexedType(SourceTy, IdxVec);
+ unsigned CounterIdx = 0;
+ if (isa<ArrayType>(Ty)) {
+ ArrayType *ArrayTy = cast<ArrayType>(Ty);
+ StructTy = dyn_cast<StructType>(ArrayTy->getElementType());
+ if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
+ continue;
+ // The last counter for struct array access.
+ CounterIdx = getArrayCounterIdx(StructTy);
+ } else if (isa<StructType>(Ty)) {
+ StructTy = cast<StructType>(Ty);
+ if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
+ continue;
+ // Get the StructTy's subfield index.
+ Idx = cast<ConstantInt>(GepInst->getOperand(i+1));
+ assert(Idx->getSExtValue() >= 0 &&
+ Idx->getSExtValue() < StructTy->getNumElements());
+ CounterIdx = getFieldCounterIdx(StructTy) + Idx->getSExtValue();
+ }
+ Res |= insertCounterUpdate(I, StructTy, CounterIdx);
+ }
+ if (Res)
+ ++NumInstrumentedGEPs;
+ else
+ ++NumIgnoredGEPs;
+ return Res;
+}
+
+bool EfficiencySanitizer::insertCounterUpdate(Instruction *I,
+ StructType *StructTy,
+ unsigned CounterIdx) {
+ GlobalVariable *CounterArray = StructTyMap[StructTy];
+ if (CounterArray == nullptr)
+ return false;
+ IRBuilder<> IRB(I);
+ Constant *Indices[2];
+ // Xref http://llvm.org/docs/LangRef.html#i-getelementptr and
+ // http://llvm.org/docs/GetElementPtr.html.
+ // The first index of the GEP instruction steps through the first operand,
+ // i.e., the array itself.
+ Indices[0] = ConstantInt::get(IRB.getInt32Ty(), 0);
+ // The second index is the index within the array.
+ Indices[1] = ConstantInt::get(IRB.getInt32Ty(), CounterIdx);
+ Constant *Counter =
+ ConstantExpr::getGetElementPtr(
+ ArrayType::get(IRB.getInt64Ty(), getStructCounterSize(StructTy)),
+ CounterArray, Indices);
+ Value *Load = IRB.CreateLoad(Counter);
+ IRB.CreateStore(IRB.CreateAdd(Load, ConstantInt::get(IRB.getInt64Ty(), 1)),
+ Counter);
+ return true;
+}
+
+int EfficiencySanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ // The store size in bits is always a multiple of 8.
+ uint32_t TypeSizeBytes = DL.getTypeStoreSizeInBits(OrigTy) / 8;
+ if (TypeSizeBytes != 1 && TypeSizeBytes != 2 && TypeSizeBytes != 4 &&
+ TypeSizeBytes != 8 && TypeSizeBytes != 16) {
+ // Irregular sizes do not have per-size call targets.
+ NumAccessesWithIrregularSize++;
+ return -1;
+ }
+ size_t Idx = countTrailingZeros(TypeSizeBytes);
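+ // E.g. TypeSizeBytes == 8 gives Idx == 3, selecting the __esan_*_8 callouts
+ // set up in initializeCallbacks.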
+ assert(Idx < NumberOfAccessSizes);
+ return Idx;
+}
+
+bool EfficiencySanitizer::instrumentFastpath(Instruction *I,
+ const DataLayout &DL, bool IsStore,
+ Value *Addr, unsigned Alignment) {
+ if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag) {
+ return instrumentFastpathCacheFrag(I, DL, Addr, Alignment);
+ } else if (Options.ToolType == EfficiencySanitizerOptions::ESAN_WorkingSet) {
+ return instrumentFastpathWorkingSet(I, DL, Addr, Alignment);
+ }
+ return false;
+}
+
+bool EfficiencySanitizer::instrumentFastpathCacheFrag(Instruction *I,
+ const DataLayout &DL,
+ Value *Addr,
+ unsigned Alignment) {
+ // Do nothing.
+ return true; // Return true to avoid slowpath instrumentation.
+}
+
+bool EfficiencySanitizer::instrumentFastpathWorkingSet(
+ Instruction *I, const DataLayout &DL, Value *Addr, unsigned Alignment) {
+ assert(ShadowScale[Options.ToolType] == 6); // The code below assumes this
+ IRBuilder<> IRB(I);
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ // Bail to the slowpath if the access might touch multiple cache lines.
+ // An access aligned to its size is guaranteed to be intra-cache-line.
+ // getMemoryAccessFuncIndex has already ruled out a size larger than 16
+ // and thus larger than a cache line for platforms this tool targets
+ // (and our shadow memory setup assumes 64-byte cache lines).
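+ // (For example, an 8-byte access with only 4-byte alignment could straddle
+ // two cache lines; unless ClAssumeIntraCacheLine is set we bail to the
+ // slowpath callout.)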
+ assert(TypeSize <= 128);
+ if (!(TypeSize == 8 ||
+ (Alignment % (TypeSize / 8)) == 0)) {
+ if (ClAssumeIntraCacheLine)
+ ++NumAssumedIntraCacheLine;
+ else
+ return false;
+ }
+
+ // We inline instrumentation to set the corresponding shadow bits for
+ // each cache line touched by the application. Here we handle a single
+ // load or store where we've already ruled out the possibility that it
+ // might touch more than one cache line and thus we simply update the
+ // shadow memory for a single cache line.
+ // Our shadow memory model is fine with races when manipulating shadow values.
+ // We generate the following code:
+ //
+ // const char BitMask = 0x81;
+ // char *ShadowAddr = appToShadow(AppAddr);
+ // if ((*ShadowAddr & BitMask) != BitMask)
+ // *ShadowAddr |= BitMask;
+ //
+ Value *AddrPtr = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *ShadowPtr = appToShadow(AddrPtr, IRB);
+ Type *ShadowTy = IntegerType::get(*Ctx, 8U);
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ // The bottom bit is used for the current sampling period's working set.
+ // The top bit is used for the total working set. We set both on each
+ // memory access, if they are not already set.
+ Value *ValueMask = ConstantInt::get(ShadowTy, 0x81); // 10000001B
+
+ Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+ // The AND and CMP will be turned into a TEST instruction by the compiler.
+ Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask);
+ TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
+ // FIXME: do I need to call SetCurrentDebugLocation?
+ IRB.SetInsertPoint(CmpTerm);
+ // We use OR to set the shadow bits to avoid corrupting the middle 6 bits,
+ // which are used by the runtime library.
+ Value *NewVal = IRB.CreateOr(OldValue, ValueMask);
+ IRB.CreateStore(NewVal, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+ IRB.SetInsertPoint(I);
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 000000000000..56d0f5e983ca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,994 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to record the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/GCOVProfiler.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+static cl::opt<std::string>
+DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
+ cl::ValueRequired);
+static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body",
+ cl::init(false), cl::Hidden);
+
+GCOVOptions GCOVOptions::getDefault() {
+ GCOVOptions Options;
+ Options.EmitNotes = true;
+ Options.EmitData = true;
+ Options.UseCfgChecksum = false;
+ Options.NoRedZone = false;
+ Options.FunctionNamesInData = true;
+ Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody;
+
+ if (DefaultGCOVVersion.size() != 4) {
+ llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+ DefaultGCOVVersion);
+ }
+ memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+ return Options;
+}
+
+namespace {
+class GCOVFunction;
+
+class GCOVProfiler {
+public:
+ GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
+ GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {
+ assert((Options.EmitNotes || Options.EmitData) &&
+ "GCOVProfiler asked to do nothing?");
+ ReversedVersion[0] = Options.Version[3];
+ ReversedVersion[1] = Options.Version[2];
+ ReversedVersion[2] = Options.Version[1];
+ ReversedVersion[3] = Options.Version[0];
+ ReversedVersion[4] = '\0';
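+ // For example, the default version string "402*" is stored here reversed
+ // as "*204"; emitProfileNotes() writes it immediately after the "oncg"
+ // magic in each .gcno file.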
+ }
+ bool runOnModule(Module &M);
+
+private:
+ // Create the .gcno files for the Module based on DebugInfo.
+ void emitProfileNotes();
+
+ // Modify the program to track transitions along edges and call into the
+ // profiling runtime to emit .gcda files when run.
+ bool emitProfileArcs();
+
+ // Get pointers to the functions in the runtime library.
+ Constant *getStartFileFunc();
+ Constant *getIncrementIndirectCounterFunc();
+ Constant *getEmitFunctionFunc();
+ Constant *getEmitArcsFunc();
+ Constant *getSummaryInfoFunc();
+ Constant *getEndFileFunc();
+
+ // Create or retrieve an i32 state value that is used to represent the
+ // pred block number for certain non-trivial edges.
+ GlobalVariable *getEdgeStateValue();
+
+ // Produce a table of pointers to counters, by predecessor and successor
+ // block number.
+ GlobalVariable *buildEdgeLookupTable(Function *F, GlobalVariable *Counter,
+ const UniqueVector<BasicBlock *> &Preds,
+ const UniqueVector<BasicBlock *> &Succs);
+
+ // Add the function to write out all our counters to the global destructor
+ // list.
+ Function *
+ insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ void insertIndirectCounterIncrement();
+
+ enum class GCovFileType { GCNO, GCDA };
+ std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
+
+ GCOVOptions Options;
+
+ // Reversed, NUL-terminated copy of Options.Version.
+ char ReversedVersion[5];
+ // Checksum, produced by hash of EdgeDestinations
+ SmallVector<uint32_t, 4> FileChecksums;
+
+ Module *M;
+ LLVMContext *Ctx;
+ SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
+};
+
+class GCOVProfilerLegacyPass : public ModulePass {
+public:
+ static char ID;
+ GCOVProfilerLegacyPass()
+ : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
+ GCOVProfilerLegacyPass(const GCOVOptions &Opts)
+ : ModulePass(ID), Profiler(Opts) {
+ initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "GCOV Profiler"; }
+
+ bool runOnModule(Module &M) override { return Profiler.runOnModule(M); }
+
+private:
+ GCOVProfiler Profiler;
+};
+}
+
+char GCOVProfilerLegacyPass::ID = 0;
+INITIALIZE_PASS(GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
+
+ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
+ return new GCOVProfilerLegacyPass(Options);
+}
+
+static StringRef getFunctionName(const DISubprogram *SP) {
+ if (!SP->getLinkageName().empty())
+ return SP->getLinkageName();
+ return SP->getName();
+}
+
+namespace {
+ class GCOVRecord {
+ protected:
+ static const char *const LinesTag;
+ static const char *const FunctionTag;
+ static const char *const BlockTag;
+ static const char *const EdgeTag;
+
+ GCOVRecord() = default;
+
+ void writeBytes(const char *Bytes, int Size) {
+ os->write(Bytes, Size);
+ }
+
+ void write(uint32_t i) {
+ writeBytes(reinterpret_cast<char*>(&i), 4);
+ }
+
+ // Returns the length measured in 4-byte blocks that will be used to
+ // represent this string in a GCOV file
+ static unsigned lengthOfGCOVString(StringRef s) {
+ // A GCOV string is a length word, the string's bytes, a NUL, then between
+ // 0 and 3 more NULs padding out to the next 4-byte word. The length is
+ // measured in 4-byte words including padding, not in bytes of actual
+ // string.
+ return (s.size() / 4) + 1;
+ }
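+
+ // Worked example (illustrative): for a 5-byte string such as "hello",
+ // lengthOfGCOVString returns (5 / 4) + 1 = 2 words, i.e. 8 bytes on disk:
+ // the 5 characters followed by 3 NUL padding bytes.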
+
+ void writeGCOVString(StringRef s) {
+ uint32_t Len = lengthOfGCOVString(s);
+ write(Len);
+ writeBytes(s.data(), s.size());
+
+ // Write 1 to 4 bytes of NUL padding.
+ assert((unsigned)(4 - (s.size() % 4)) > 0);
+ assert((unsigned)(4 - (s.size() % 4)) <= 4);
+ writeBytes("\0\0\0\0", 4 - (s.size() % 4));
+ }
+
+ raw_ostream *os;
+ };
+ const char *const GCOVRecord::LinesTag = "\0\0\x45\x01";
+ const char *const GCOVRecord::FunctionTag = "\0\0\0\1";
+ const char *const GCOVRecord::BlockTag = "\0\0\x41\x01";
+ const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01";
+
+ class GCOVFunction;
+ class GCOVBlock;
+
+ // Constructed only by requesting it from a GCOVBlock, this object stores a
+ // list of line numbers and a single filename, representing lines that belong
+ // to the block.
+ class GCOVLines : public GCOVRecord {
+ public:
+ void addLine(uint32_t Line) {
+ assert(Line != 0 && "Line zero is not a valid real line number.");
+ Lines.push_back(Line);
+ }
+
+ uint32_t length() const {
+ // Here 2 = 1 for string length + 1 for '0' id#.
+ return lengthOfGCOVString(Filename) + 2 + Lines.size();
+ }
+
+ void writeOut() {
+ write(0);
+ writeGCOVString(Filename);
+ for (int i = 0, e = Lines.size(); i != e; ++i)
+ write(Lines[i]);
+ }
+
+ GCOVLines(StringRef F, raw_ostream *os)
+ : Filename(F) {
+ this->os = os;
+ }
+
+ private:
+ StringRef Filename;
+ SmallVector<uint32_t, 32> Lines;
+ };
+
+
+ // Represents a basic block in GCOV. Each block has a unique number within
+ // its function, a set of lines belonging to it, and a set of edges to
+ // other blocks.
+ class GCOVBlock : public GCOVRecord {
+ public:
+ GCOVLines &getFile(StringRef Filename) {
+ return LinesByFile.try_emplace(Filename, Filename, os).first->second;
+ }
+
+ void addEdge(GCOVBlock &Successor) {
+ OutEdges.push_back(&Successor);
+ }
+
+ void writeOut() {
+ uint32_t Len = 3;
+ SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
+ for (auto &I : LinesByFile) {
+ Len += I.second.length();
+ SortedLinesByFile.push_back(&I);
+ }
+
+ writeBytes(LinesTag, 4);
+ write(Len);
+ write(Number);
+
+ std::sort(
+ SortedLinesByFile.begin(), SortedLinesByFile.end(),
+ [](StringMapEntry<GCOVLines> *LHS, StringMapEntry<GCOVLines> *RHS) {
+ return LHS->getKey() < RHS->getKey();
+ });
+ for (auto &I : SortedLinesByFile)
+ I->getValue().writeOut();
+ write(0);
+ write(0);
+ }
+
+ GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
+ // Only allow copy before edges and lines have been added. After that,
+ // there are inter-block pointers (eg: edges) that won't take kindly to
+ // blocks being copied or moved around.
+ assert(LinesByFile.empty());
+ assert(OutEdges.empty());
+ }
+
+ private:
+ friend class GCOVFunction;
+
+ GCOVBlock(uint32_t Number, raw_ostream *os)
+ : Number(Number) {
+ this->os = os;
+ }
+
+ uint32_t Number;
+ StringMap<GCOVLines> LinesByFile;
+ SmallVector<GCOVBlock *, 4> OutEdges;
+ };
+
+ // A function has a unique identifier, a checksum, a set of blocks, and a
+ // map of edges between blocks. This is the only GCOV object users can
+ // construct; the blocks and lines will be rooted here.
+ class GCOVFunction : public GCOVRecord {
+ public:
+ GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os,
+ uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody)
+ : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),
+ ReturnBlock(1, os) {
+ this->os = os;
+
+ DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+
+ uint32_t i = 0;
+ for (auto &BB : *F) {
+ // Skip index 1 if it's assigned to the ReturnBlock.
+ if (i == 1 && ExitBlockBeforeBody)
+ ++i;
+ Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os)));
+ }
+ if (!ExitBlockBeforeBody)
+ ReturnBlock.Number = i;
+
+ std::string FunctionNameAndLine;
+ raw_string_ostream FNLOS(FunctionNameAndLine);
+ FNLOS << getFunctionName(SP) << SP->getLine();
+ FNLOS.flush();
+ FuncChecksum = hash_value(FunctionNameAndLine);
+ }
+
+ GCOVBlock &getBlock(BasicBlock *BB) {
+ return Blocks.find(BB)->second;
+ }
+
+ GCOVBlock &getReturnBlock() {
+ return ReturnBlock;
+ }
+
+ std::string getEdgeDestinations() {
+ std::string EdgeDestinations;
+ raw_string_ostream EDOS(EdgeDestinations);
+ Function *F = Blocks.begin()->first->getParent();
+ for (BasicBlock &I : *F) {
+ GCOVBlock &Block = getBlock(&I);
+ for (int i = 0, e = Block.OutEdges.size(); i != e; ++i)
+ EDOS << Block.OutEdges[i]->Number;
+ }
+ return EdgeDestinations;
+ }
+
+ uint32_t getFuncChecksum() {
+ return FuncChecksum;
+ }
+
+ void setCfgChecksum(uint32_t Checksum) {
+ CfgChecksum = Checksum;
+ }
+
+ void writeOut() {
+ writeBytes(FunctionTag, 4);
+ uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
+ 1 + lengthOfGCOVString(SP->getFilename()) + 1;
+ if (UseCfgChecksum)
+ ++BlockLen;
+ write(BlockLen);
+ write(Ident);
+ write(FuncChecksum);
+ if (UseCfgChecksum)
+ write(CfgChecksum);
+ writeGCOVString(getFunctionName(SP));
+ writeGCOVString(SP->getFilename());
+ write(SP->getLine());
+
+ // Emit count of blocks.
+ writeBytes(BlockTag, 4);
+ write(Blocks.size() + 1);
+ for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
+ write(0); // No flags on our blocks.
+ }
+ DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+
+ // Emit edges between blocks.
+ if (Blocks.empty()) return;
+ Function *F = Blocks.begin()->first->getParent();
+ for (BasicBlock &I : *F) {
+ GCOVBlock &Block = getBlock(&I);
+ if (Block.OutEdges.empty()) continue;
+
+ writeBytes(EdgeTag, 4);
+ write(Block.OutEdges.size() * 2 + 1);
+ write(Block.Number);
+ for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
+ DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number
+ << "\n");
+ write(Block.OutEdges[i]->Number);
+ write(0); // no flags
+ }
+ }
+
+ // Emit lines for each block.
+ for (BasicBlock &I : *F)
+ getBlock(&I).writeOut();
+ }
+
+ private:
+ const DISubprogram *SP;
+ uint32_t Ident;
+ uint32_t FuncChecksum;
+ bool UseCfgChecksum;
+ uint32_t CfgChecksum;
+ DenseMap<BasicBlock *, GCOVBlock> Blocks;
+ GCOVBlock ReturnBlock;
+ };
+}
+
+std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
+ GCovFileType OutputType) {
+ bool Notes = OutputType == GCovFileType::GCNO;
+
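+ // The "llvm.gcov" module metadata, when present, overrides the default
+ // file naming. Based on the handling below, a three-element node looks
+ // roughly like (paths and ids are illustrative):
+ //   !llvm.gcov = !{!0}
+ //   !0 = !{!"foo.gcno", !"foo.gcda", !1}  ; !1 is the compile unit
+ // while a two-element node carries a single file name whose extension is
+ // replaced with "gcno" or "gcda".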
+ if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+ for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+ MDNode *N = GCov->getOperand(i);
+ bool ThreeElement = N->getNumOperands() == 3;
+ if (!ThreeElement && N->getNumOperands() != 2)
+ continue;
+ if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
+ continue;
+
+ if (ThreeElement) {
+ // These nodes have no mangling to apply; the names are stored already
+ // mangled in the bitcode.
+ MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
+ MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
+ if (!NotesFile || !DataFile)
+ continue;
+ return Notes ? NotesFile->getString() : DataFile->getString();
+ }
+
+ MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+ if (!GCovFile)
+ continue;
+
+ SmallString<128> Filename = GCovFile->getString();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ return Filename.str();
+ }
+ }
+
+ SmallString<128> Filename = CU->getFilename();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ StringRef FName = sys::path::filename(Filename);
+ SmallString<128> CurPath;
+ if (sys::fs::current_path(CurPath)) return FName;
+ sys::path::append(CurPath, FName);
+ return CurPath.str();
+}
+
+bool GCOVProfiler::runOnModule(Module &M) {
+ this->M = &M;
+ Ctx = &M.getContext();
+
+ if (Options.EmitNotes) emitProfileNotes();
+ if (Options.EmitData) return emitProfileArcs();
+ return false;
+}
+
+PreservedAnalyses GCOVProfilerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ GCOVProfiler Profiler(GCOVOpts);
+
+ if (!Profiler.runOnModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool functionHasLines(Function &F) {
+ // Check whether this function actually has any source lines. Not only
+ // do these waste space, they also can crash gcov.
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+
+ return true;
+ }
+ }
+ return false;
+}
+
+void GCOVProfiler::emitProfileNotes() {
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes) return;
+
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ // Each compile unit gets its own .gcno file. This means that whether we run
+ // this pass over the original .o's as they're produced, or run it after
+ // LTO, we'll generate the same .gcno files.
+
+ auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::error_code EC;
+ raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, sys::fs::F_None);
+ std::string EdgeDestinations;
+
+ unsigned FunctionIdent = 0;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ if (!SP) continue;
+ if (!functionHasLines(F)) continue;
+
+ // gcov expects every function to start with an entry block that has a
+ // single successor, so split the entry block to make sure of that.
+ BasicBlock &EntryBlock = F.getEntryBlock();
+ BasicBlock::iterator It = EntryBlock.begin();
+ while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
+ ++It;
+ EntryBlock.splitBasicBlock(It);
+
+ Funcs.push_back(make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++,
+ Options.UseCfgChecksum,
+ Options.ExitBlockBeforeBody));
+ GCOVFunction &Func = *Funcs.back();
+
+ for (auto &BB : F) {
+ GCOVBlock &Block = Func.getBlock(&BB);
+ TerminatorInst *TI = BB.getTerminator();
+ if (int successors = TI->getNumSuccessors()) {
+ for (int i = 0; i != successors; ++i) {
+ Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
+ }
+ } else if (isa<ReturnInst>(TI)) {
+ Block.addEdge(Func.getReturnBlock());
+ }
+
+ uint32_t Line = 0;
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+
+ if (Line == Loc.getLine()) continue;
+ Line = Loc.getLine();
+ if (SP != getDISubprogram(Loc.getScope()))
+ continue;
+
+ GCOVLines &Lines = Block.getFile(SP->getFilename());
+ Lines.addLine(Loc.getLine());
+ }
+ }
+ EdgeDestinations += Func.getEdgeDestinations();
+ }
+
+ FileChecksums.push_back(hash_value(EdgeDestinations));
+ out.write("oncg", 4);
+ out.write(ReversedVersion, 4);
+ out.write(reinterpret_cast<char*>(&FileChecksums.back()), 4);
+
+ for (auto &Func : Funcs) {
+ Func->setCfgChecksum(FileChecksums.back());
+ Func->writeOut();
+ }
+
+ out.write("\0\0\0\0\0\0\0\0", 8); // EOF
+ out.close();
+ }
+}
+
+bool GCOVProfiler::emitProfileArcs() {
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes) return false;
+
+ bool Result = false;
+ bool InsertIndCounterIncrCode = false;
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ if (!SP) continue;
+ if (!functionHasLines(F)) continue;
+ if (!Result) Result = true;
+ unsigned Edges = 0;
+ for (auto &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (isa<ReturnInst>(TI))
+ ++Edges;
+ else
+ Edges += TI->getNumSuccessors();
+ }
+
+ ArrayType *CounterTy =
+ ArrayType::get(Type::getInt64Ty(*Ctx), Edges);
+ GlobalVariable *Counters =
+ new GlobalVariable(*M, CounterTy, false,
+ GlobalValue::InternalLinkage,
+ Constant::getNullValue(CounterTy),
+ "__llvm_gcov_ctr");
+ CountersBySP.push_back(std::make_pair(Counters, SP));
+
+ UniqueVector<BasicBlock *> ComplexEdgePreds;
+ UniqueVector<BasicBlock *> ComplexEdgeSuccs;
+
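+ // Conceptually, each edge update emitted below is the C statement
+ //   __llvm_gcov_ctr[Edge]++;
+ // (an illustrative sketch, not generated source). For a single-successor
+ // block the index is the constant Edge; for a two-way branch a select
+ // chooses between Edge and Edge + 1; any other terminator (e.g. a switch)
+ // goes through the "complex" edge table handled further below.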
+ unsigned Edge = 0;
+ for (auto &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+ if (Successors) {
+ if (Successors == 1) {
+ IRBuilder<> Builder(&*BB.getFirstInsertionPt());
+ Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+ Edge);
+ Value *Count = Builder.CreateLoad(Counter);
+ Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+ Builder.CreateStore(Count, Counter);
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ IRBuilder<> Builder(BI);
+ Value *Sel = Builder.CreateSelect(BI->getCondition(),
+ Builder.getInt64(Edge),
+ Builder.getInt64(Edge + 1));
+ Value *Counter = Builder.CreateInBoundsGEP(
+ Counters->getValueType(), Counters, {Builder.getInt64(0), Sel});
+ Value *Count = Builder.CreateLoad(Counter);
+ Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+ Builder.CreateStore(Count, Counter);
+ } else {
+ ComplexEdgePreds.insert(&BB);
+ for (int i = 0; i != Successors; ++i)
+ ComplexEdgeSuccs.insert(TI->getSuccessor(i));
+ }
+
+ Edge += Successors;
+ }
+ }
+
+ if (!ComplexEdgePreds.empty()) {
+ GlobalVariable *EdgeTable =
+ buildEdgeLookupTable(&F, Counters,
+ ComplexEdgePreds, ComplexEdgeSuccs);
+ GlobalVariable *EdgeState = getEdgeStateValue();
+
+ for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
+ IRBuilder<> Builder(&*ComplexEdgePreds[i + 1]->getFirstInsertionPt());
+ Builder.CreateStore(Builder.getInt32(i), EdgeState);
+ }
+
+ for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
+ // Call runtime to perform increment.
+ IRBuilder<> Builder(&*ComplexEdgeSuccs[i + 1]->getFirstInsertionPt());
+ Value *CounterPtrArray =
+ Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
+ i * ComplexEdgePreds.size());
+
+ // Build code to increment the counter.
+ InsertIndCounterIncrCode = true;
+ Builder.CreateCall(getIncrementIndirectCounterFunc(),
+ {EdgeState, CounterPtrArray});
+ }
+ }
+ }
+
+ Function *WriteoutF = insertCounterWriteout(CountersBySP);
+ Function *FlushF = insertFlush(CountersBySP);
+
+ // Create a small bit of code that registers the "__llvm_gcov_writeout"
+ // function to be executed at exit and the "__llvm_gcov_flush" function to
+ // be executed when "__gcov_flush" is called.
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_init", M);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ F->setLinkage(GlobalValue::InternalLinkage);
+ F->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ IRBuilder<> Builder(BB);
+
+ FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Type *Params[] = {
+ PointerType::get(FTy, 0),
+ PointerType::get(FTy, 0)
+ };
+ FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
+
+ // Initialize the environment and register the local writeout and flush
+ // functions.
+ Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
+ Builder.CreateCall(GCOVInit, {WriteoutF, FlushF});
+ Builder.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+ }
+
+ if (InsertIndCounterIncrCode)
+ insertIndirectCounterIncrement();
+
+ return Result;
+}
+
+// All edges whose terminators aren't simple branches are "complex", because
+// they require extra logic to pick which counter to update.
+GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
+ Function *F,
+ GlobalVariable *Counters,
+ const UniqueVector<BasicBlock *> &Preds,
+ const UniqueVector<BasicBlock *> &Succs) {
+ // TODO: support invoke, threads. We rely on the fact that nothing can modify
+ // the whole-Module pred edge# between the time we set it and the time we next
+ // read it. Threads and invoke make this untrue.
+
+ // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]].
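+ // For example (with illustrative sizes), given 2 predecessors and 3
+ // successors, the counter pointer for successor id 2 and predecessor id 1
+ // is stored at index (2 - 1) * 2 + (1 - 1) = 2, matching the indexing
+ // expression used below.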
+ size_t TableSize = Succs.size() * Preds.size();
+ Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+ ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize);
+
+ std::unique_ptr<Constant * []> EdgeTable(new Constant *[TableSize]);
+ Constant *NullValue = Constant::getNullValue(Int64PtrTy);
+ for (size_t i = 0; i != TableSize; ++i)
+ EdgeTable[i] = NullValue;
+
+ unsigned Edge = 0;
+ for (BasicBlock &BB : *F) {
+ TerminatorInst *TI = BB.getTerminator();
+ int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+ if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) {
+ for (int i = 0; i != Successors; ++i) {
+ BasicBlock *Succ = TI->getSuccessor(i);
+ IRBuilder<> Builder(Succ);
+ Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+ Edge + i);
+ EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) +
+ (Preds.idFor(&BB) - 1)] = cast<Constant>(Counter);
+ }
+ }
+ Edge += Successors;
+ }
+
+ GlobalVariable *EdgeTableGV =
+ new GlobalVariable(
+ *M, EdgeTableTy, true, GlobalValue::InternalLinkage,
+ ConstantArray::get(EdgeTableTy,
+ makeArrayRef(&EdgeTable[0],TableSize)),
+ "__llvm_gcda_edge_table");
+ EdgeTableGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ return EdgeTableGV;
+}
+
+Constant *GCOVProfiler::getStartFileFunc() {
+ Type *Args[] = {
+ Type::getInt8PtrTy(*Ctx), // const char *orig_filename
+ Type::getInt8PtrTy(*Ctx), // const char version[4]
+ Type::getInt32Ty(*Ctx), // uint32_t checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+}
+
+Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ Type *Int64Ty = Type::getInt64Ty(*Ctx);
+ Type *Args[] = {
+ Int32Ty->getPointerTo(), // uint32_t *predecessor
+ Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy);
+}
+
+Constant *GCOVProfiler::getEmitFunctionFunc() {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t ident
+ Type::getInt8PtrTy(*Ctx), // const char *function_name
+ Type::getInt32Ty(*Ctx), // uint32_t func_checksum
+ Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum
+ Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+}
+
+Constant *GCOVProfiler::getEmitArcsFunc() {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t num_counters
+ Type::getInt64PtrTy(*Ctx), // uint64_t *counters
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+}
+
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
+Constant *GCOVProfiler::getEndFileFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
+}
+
+GlobalVariable *GCOVProfiler::getEdgeStateValue() {
+ GlobalVariable *GV = M->getGlobalVariable("__llvm_gcov_global_state_pred");
+ if (!GV) {
+ GV = new GlobalVariable(*M, Type::getInt32Ty(*Ctx), false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::get(Type::getInt32Ty(*Ctx),
+ 0xffffffff),
+ "__llvm_gcov_global_state_pred");
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ }
+ return GV;
+}
+
+Function *GCOVProfiler::insertCounterWriteout(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
+ FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ if (!WriteoutF)
+ WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_writeout", M);
+ WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ WriteoutF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ WriteoutF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
+ IRBuilder<> Builder(BB);
+
+ Constant *StartFile = getStartFileFunc();
+ Constant *EmitFunction = getEmitFunctionFunc();
+ Constant *EmitArcs = getEmitArcsFunc();
+ Constant *SummaryInfo = getSummaryInfoFunc();
+ Constant *EndFile = getEndFileFunc();
+
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (CU_Nodes) {
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
+ uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
+ Builder.CreateCall(StartFile,
+ {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.CreateGlobalStringPtr(ReversedVersion),
+ Builder.getInt32(CfgChecksum)});
+ for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
+ auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
+ uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
+ Builder.CreateCall(
+ EmitFunction,
+ {Builder.getInt32(j),
+ Options.FunctionNamesInData
+ ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
+ : Constant::getNullValue(Builder.getInt8PtrTy()),
+ Builder.getInt32(FuncChecksum),
+ Builder.getInt8(Options.UseCfgChecksum),
+ Builder.getInt32(CfgChecksum)});
+
+ GlobalVariable *GV = CountersBySP[j].first;
+ unsigned Arcs =
+ cast<ArrayType>(GV->getValueType())->getNumElements();
+ Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs),
+ Builder.CreateConstGEP2_64(GV, 0, 0)});
+ }
+ Builder.CreateCall(SummaryInfo, {});
+ Builder.CreateCall(EndFile, {});
+ }
+ }
+
+ Builder.CreateRetVoid();
+ return WriteoutF;
+}
+
+void GCOVProfiler::insertIndirectCounterIncrement() {
+ Function *Fn =
+ cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
+ Fn->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ Fn->setLinkage(GlobalValue::InternalLinkage);
+ Fn->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ Fn->addFnAttr(Attribute::NoRedZone);
+
+ // Create basic blocks for function.
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn);
+ IRBuilder<> Builder(BB);
+
+ BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn);
+ BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn);
+ BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn);
+
+ // uint32_t pred = *predecessor;
+ // if (pred == 0xffffffff) return;
+ Argument *Arg = &*Fn->arg_begin();
+ Arg->setName("predecessor");
+ Value *Pred = Builder.CreateLoad(Arg, "pred");
+ Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff));
+ BranchInst::Create(Exit, PredNotNegOne, Cond, BB);
+
+ Builder.SetInsertPoint(PredNotNegOne);
+
+ // uint64_t *counter = counters[pred];
+ // if (!counter) return;
+ Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty());
+ Arg = &*std::next(Fn->arg_begin());
+ Arg->setName("counters");
+ Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred);
+ Value *Counter = Builder.CreateLoad(GEP, "counter");
+ Cond = Builder.CreateICmpEQ(Counter,
+ Constant::getNullValue(
+ Builder.getInt64Ty()->getPointerTo()));
+ Builder.CreateCondBr(Cond, Exit, CounterEnd);
+
+ // ++*counter;
+ Builder.SetInsertPoint(CounterEnd);
+ Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter),
+ Builder.getInt64(1));
+ Builder.CreateStore(Add, Counter);
+ Builder.CreateBr(Exit);
+
+ // Fill in the exit block.
+ Builder.SetInsertPoint(Exit);
+ Builder.CreateRetVoid();
+}
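+
+// Taken together, the IR built above corresponds roughly to this C sketch
+// (parameter names follow the inline comments above; illustrative only):
+//
+//   void __llvm_gcov_indirect_counter_increment(uint32_t *predecessor,
+//                                               uint64_t **counters) {
+//     uint32_t pred = *predecessor;
+//     if (pred == 0xffffffff) return;
+//     uint64_t *counter = counters[pred];
+//     if (!counter) return;
+//     ++*counter;
+//   }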
+
+Function *GCOVProfiler::
+insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *FlushF = M->getFunction("__llvm_gcov_flush");
+ if (!FlushF)
+ FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_flush", M);
+ else
+ FlushF->setLinkage(GlobalValue::InternalLinkage);
+ FlushF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ FlushF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ FlushF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+
+ // Write out the current counters.
+ Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ assert(WriteoutF && "Need to create the writeout function first!");
+
+ IRBuilder<> Builder(Entry);
+ Builder.CreateCall(WriteoutF, {});
+
+ // Zero out the counters.
+ for (const auto &I : CountersBySP) {
+ GlobalVariable *GV = I.first;
+ Constant *Null = Constant::getNullValue(GV->getValueType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = FlushF->getReturnType();
+ if (RetTy == Type::getVoidTy(*Ctx))
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __llvm_gcov_flush was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __llvm_gcov_flush");
+
+ return FlushF;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
new file mode 100644
index 000000000000..4089d81ea3e1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -0,0 +1,685 @@
+//===-- IndirectCallPromotion.cpp - Optimizations based on value profiling ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that promotes indirect calls to
+// conditional direct calls when the indirect-call value profile metadata is
+// available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom"
+
+STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
+STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+
+// Command-line option to disable indirect-call promotion; the default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
+ cl::desc("Disable indirect call promotion"));
+
+// Set the cutoff value for the promotion. If the value is other than 0, we
+// stop the transformation once the total number of promotions equals the cutoff
+// value.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of promotions for this compilation"));
+
+// If ICPCSSkip is non-zero, the first ICPCSSkip callsites will be skipped.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Skip Callsite up to this number for this compilation"));
+
+// Set if the pass is called in LTO optimization. The difference in LTO mode
+// is that the pass won't prefix internal-linkage symbols with the source
+// module name.
+static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in LTO "
+ "mode"));
+
+// Set if the pass is called in SamplePGO mode. The difference in SamplePGO
+// mode is that it will add prof metadata to the created direct call.
+static cl::opt<bool>
+ ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
+// If the option is set to true, only call instructions will be considered for
+// transformation -- invoke instructions will be ignored.
+static cl::opt<bool>
+ ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion for call instructions "
+ "only"));
+
+// If the option is set to true, only invoke instructions will be considered for
+// transformation -- call instructions will be ignored.
+static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
+ cl::Hidden,
+ cl::desc("Run indirect-call promotion for "
+ "invoke instruction only"));
+
+// Dump the function level IR if the transformation happened in this
+// function. For debug use only.
+static cl::opt<bool>
+ ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
+ cl::desc("Dump IR after transformation happens"));
+
+namespace {
+class PGOIndirectCallPromotionLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+ : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
+ initializePGOIndirectCallPromotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
+
+private:
+ bool runOnModule(Module &M) override;
+
+ // Set if this pass is called in LTO. We need special handling of the
+ // PGOFuncName for static variables due to LTO's internalization.
+ bool InLTO;
+
+ // Set if this pass is called in SamplePGO. We need to add prof metadata to
+ // the promoted direct call.
+ bool SamplePGO;
+};
+} // end anonymous namespace
+
+char PGOIndirectCallPromotionLegacyPass::ID = 0;
+INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect calls to "
+ "direct calls.",
+ false, false)
+
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+ bool SamplePGO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+namespace {
+// The main data structure used to promote indirect calls to conditional
+// direct calls.
+class ICallPromotionFunc {
+private:
+ Function &F;
+ Module *M;
+
+ // Symtab that maps indirect call profile values to function names and
+ // definitions.
+ InstrProfSymtab *Symtab;
+
+ bool SamplePGO;
+
+ // Test if we can legally promote this indirect call to a direct call of
+ // Target.
+ bool isPromotionLegal(Instruction *Inst, uint64_t Target, Function *&F,
+ const char **Reason = nullptr);
+
+ // A struct that records the direct target and its call count.
+ struct PromotionCandidate {
+ Function *TargetFunction;
+ uint64_t Count;
+ PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ };
+
+ // Check if the indirect-call call site should be promoted. Return the number
+ // of promotions. Inst is the candidate indirect call, ValueDataRef
+ // contains the array of value profile data for profiled targets,
+ // TotalCount is the total profiled count of call executions, and
+ // NumCandidates is the number of candidate entries in ValueDataRef.
+ std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
+ Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates);
+
+ // Promote a list of targets for one indirect-call callsite. Return
+ // the number of promotions.
+ uint32_t tryToPromote(Instruction *Inst,
+ const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount);
+
+ // Noncopyable
+ ICallPromotionFunc(const ICallPromotionFunc &other) = delete;
+ ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
+
+public:
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+ bool SamplePGO)
+ : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO) {}
+
+ bool processFunction();
+};
+} // end anonymous namespace
+
+bool llvm::isLegalToPromote(Instruction *Inst, Function *F,
+ const char **Reason) {
+ // Check the return type.
+ Type *CallRetType = Inst->getType();
+ if (!CallRetType->isVoidTy()) {
+ Type *FuncRetType = F->getReturnType();
+ if (FuncRetType != CallRetType &&
+ !CastInst::isBitCastable(FuncRetType, CallRetType)) {
+ if (Reason)
+ *Reason = "Return type mismatch";
+ return false;
+ }
+ }
+
+ // Check if the arguments are compatible with the parameters
+ FunctionType *DirectCalleeType = F->getFunctionType();
+ unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
+ CallSite CS(Inst);
+ unsigned ArgNum = CS.arg_size();
+
+ if (ParamNum != ArgNum && !DirectCalleeType->isVarArg()) {
+ if (Reason)
+ *Reason = "The number of arguments mismatch";
+ return false;
+ }
+
+ for (unsigned I = 0; I < ParamNum; ++I) {
+ Type *PTy = DirectCalleeType->getFunctionParamType(I);
+ Type *ATy = CS.getArgument(I)->getType();
+ if (PTy == ATy)
+ continue;
+ if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy)) {
+ if (Reason)
+ *Reason = "Argument type mismatch";
+ return false;
+ }
+ }
+
+ DEBUG(dbgs() << " #" << NumOfPGOICallPromotion << " Promote the icall to "
+ << F->getName() << "\n");
+ return true;
+}
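+
+// Illustrative example (hypothetical signatures): promoting a call made
+// through a "void *(*)(int)" function pointer to a target declared as
+// "char *f(int)" is accepted, since the pointer return types are
+// bitcastable; a non-vararg target taking two parameters would instead be
+// rejected with "The number of arguments mismatch".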
+
+bool ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
+ Function *&TargetFunction,
+ const char **Reason) {
+ TargetFunction = Symtab->getFunction(Target);
+ if (TargetFunction == nullptr) {
+ *Reason = "Cannot find the target";
+ return false;
+ }
+ return isLegalToPromote(Inst, TargetFunction, Reason);
+}
+
+// Indirect-call promotion heuristic. The direct targets are sorted based on
+// the count. Stop at the first target that is not promoted.
+std::vector<ICallPromotionFunc::PromotionCandidate>
+ICallPromotionFunc::getPromotionCandidatesForCallSite(
+ Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates) {
+ std::vector<PromotionCandidate> Ret;
+
+ DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
+ NumOfPGOICallsites++;
+ if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
+ DEBUG(dbgs() << " Skip: User options.\n");
+ return Ret;
+ }
+
+ for (uint32_t I = 0; I < NumCandidates; I++) {
+ uint64_t Count = ValueDataRef[I].Count;
+ assert(Count <= TotalCount);
+ uint64_t Target = ValueDataRef[I].Value;
+ DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
+
+ if (ICPInvokeOnly && dyn_cast<CallInst>(Inst)) {
+ DEBUG(dbgs() << " Not promote: User options.\n");
+ break;
+ }
+ if (ICPCallOnly && dyn_cast<InvokeInst>(Inst)) {
+ DEBUG(dbgs() << " Not promote: User option.\n");
+ break;
+ }
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ break;
+ }
+ Function *TargetFunction = nullptr;
+ const char *Reason = nullptr;
+ if (!isPromotionLegal(Inst, Target, TargetFunction, &Reason)) {
+ StringRef TargetFuncName = Symtab->getFuncName(Target);
+ DEBUG(dbgs() << " Not promote: " << Reason << "\n");
+ emitOptimizationRemarkMissed(
+ F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
+ Twine("Cannot promote indirect call to ") +
+ (TargetFuncName.empty() ? Twine(Target) : Twine(TargetFuncName)) +
+ Twine(" with count of ") + Twine(Count) + ": " + Reason);
+ break;
+ }
+ Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ TotalCount -= Count;
+ }
+ return Ret;
+}
+
+// Create a diamond (if-then-else) structure. Also update the profile
+// counts and do the fix-up for invoke instructions.
+static void createIfThenElse(Instruction *Inst, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount,
+ BasicBlock **DirectCallBB,
+ BasicBlock **IndirectCallBB,
+ BasicBlock **MergeBB) {
+ CallSite CS(Inst);
+ Value *OrigCallee = CS.getCalledValue();
+
+ IRBuilder<> BBBuilder(Inst);
+ LLVMContext &Ctx = Inst->getContext();
+ Value *BCI1 =
+ BBBuilder.CreateBitCast(OrigCallee, Type::getInt8PtrTy(Ctx), "");
+ Value *BCI2 =
+ BBBuilder.CreateBitCast(DirectCallee, Type::getInt8PtrTy(Ctx), "");
+ Value *PtrCmp = BBBuilder.CreateICmpEQ(BCI1, BCI2, "");
+
+ uint64_t ElseCount = TotalCount - Count;
+ uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
+ uint64_t Scale = calculateCountScale(MaxCount);
+ MDBuilder MDB(Inst->getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCmp, Inst, &ThenTerm, &ElseTerm,
+ BranchWeights);
+ *DirectCallBB = ThenTerm->getParent();
+ (*DirectCallBB)->setName("if.true.direct_targ");
+ *IndirectCallBB = ElseTerm->getParent();
+ (*IndirectCallBB)->setName("if.false.orig_indirect");
+ *MergeBB = Inst->getParent();
+ (*MergeBB)->setName("if.end.icp");
+
+ // Special handing of Invoke instructions.
+ InvokeInst *II = dyn_cast<InvokeInst>(Inst);
+ if (!II)
+ return;
+
+ // We don't need branch instructions for invoke.
+ ThenTerm->eraseFromParent();
+ ElseTerm->eraseFromParent();
+
+ // Add jump from Merge BB to the NormalDest. This is needed for the newly
+ // created direct invoke stmt -- as its NormalDst will be fixed up to MergeBB.
+ BranchInst::Create(II->getNormalDest(), *MergeBB);
+}
+
+// Find the PHI in BB that has the CallResult as an operand.
+static bool getCallRetPHINode(BasicBlock *BB, Instruction *Inst) {
+ BasicBlock *From = Inst->getParent();
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(From);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ if (dyn_cast<Instruction>(V) == Inst)
+ return true;
+ }
+ return false;
+}
+
+// This method fixes up PHI nodes in BB where BB is the UnwindDest of an
+// invoke instruction. In BB, there may be PHIs with incoming block being
+// OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
+// instruction to its own BB, OrigBB is no longer the predecessor block of BB.
+// Instead two new predecessors are added: IndirectCallBB and DirectCallBB,
+// so the PHI node's incoming BBs need to be fixed up accordingly.
+static void fixupPHINodeForUnwind(Instruction *Inst, BasicBlock *BB,
+ BasicBlock *OrigBB,
+ BasicBlock *IndirectCallBB,
+ BasicBlock *DirectCallBB) {
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(OrigBB);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ PHI->addIncoming(V, IndirectCallBB);
+ PHI->setIncomingBlock(IX, DirectCallBB);
+ }
+}
+
+// This method fixes up PHI nodes in BB where BB is the NormalDest of an
+// invoke instruction. In BB, there may be PHIs with incoming block being
+// OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
+// instruction to its own BB, a new incoming edge will be added to the original
+// NormalDstBB from the IndirectCallBB.
+static void fixupPHINodeForNormalDest(Instruction *Inst, BasicBlock *BB,
+ BasicBlock *OrigBB,
+ BasicBlock *IndirectCallBB,
+ Instruction *NewInst) {
+ for (auto &I : *BB) {
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ continue;
+ int IX = PHI->getBasicBlockIndex(OrigBB);
+ if (IX == -1)
+ continue;
+ Value *V = PHI->getIncomingValue(IX);
+ if (dyn_cast<Instruction>(V) == Inst) {
+ PHI->setIncomingBlock(IX, IndirectCallBB);
+ PHI->addIncoming(NewInst, OrigBB);
+ continue;
+ }
+ PHI->addIncoming(V, IndirectCallBB);
+ }
+}
+
+// Add a bitcast instruction to the direct-call return value if needed.
+static Instruction *insertCallRetCast(const Instruction *Inst,
+ Instruction *DirectCallInst,
+ Function *DirectCallee) {
+ if (Inst->getType()->isVoidTy())
+ return DirectCallInst;
+
+ Type *CallRetType = Inst->getType();
+ Type *FuncRetType = DirectCallee->getReturnType();
+ if (FuncRetType == CallRetType)
+ return DirectCallInst;
+
+ BasicBlock *InsertionBB;
+ if (CallInst *CI = dyn_cast<CallInst>(DirectCallInst))
+ InsertionBB = CI->getParent();
+ else
+ InsertionBB = (dyn_cast<InvokeInst>(DirectCallInst))->getNormalDest();
+
+ return (new BitCastInst(DirectCallInst, CallRetType, "",
+ InsertionBB->getTerminator()));
+}
+
+// Create a DirectCall instruction in the DirectCallBB.
+// Parameter Inst is the indirect-call (invoke) instruction.
+// DirectCallee is the decl of the direct-call (invoke) target.
+// DirectCallBB is the BB into which the direct-call (invoke) instruction is
+// inserted.
+// MergeBB is the bottom BB of the if-then-else-diamond after the
+// transformation. For invoke instruction, the edges from DirectCallBB and
+// IndirectCallBB to MergeBB are removed before this call (during
+// createIfThenElse).
+static Instruction *createDirectCallInst(const Instruction *Inst,
+ Function *DirectCallee,
+ BasicBlock *DirectCallBB,
+ BasicBlock *MergeBB) {
+ Instruction *NewInst = Inst->clone();
+ if (CallInst *CI = dyn_cast<CallInst>(NewInst)) {
+ CI->setCalledFunction(DirectCallee);
+ CI->mutateFunctionType(DirectCallee->getFunctionType());
+ } else {
+ // Must be an invoke instruction. Direct invoke's normal destination is
+ // fixed up to MergeBB. MergeBB is the place where return cast is inserted.
+ // Also since IndirectCallBB does not have an edge to MergeBB, there is no
+ // need to insert new PHIs into MergeBB.
+ InvokeInst *II = dyn_cast<InvokeInst>(NewInst);
+ assert(II);
+ II->setCalledFunction(DirectCallee);
+ II->mutateFunctionType(DirectCallee->getFunctionType());
+ II->setNormalDest(MergeBB);
+ }
+
+ DirectCallBB->getInstList().insert(DirectCallBB->getFirstInsertionPt(),
+ NewInst);
+
+ // Clear the value profile data.
+ NewInst->setMetadata(LLVMContext::MD_prof, nullptr);
+ CallSite NewCS(NewInst);
+ FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
+ unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
+ for (unsigned I = 0; I < ParamNum; ++I) {
+ Type *ATy = NewCS.getArgument(I)->getType();
+ Type *PTy = DirectCalleeType->getParamType(I);
+ if (ATy != PTy) {
+ BitCastInst *BI = new BitCastInst(NewCS.getArgument(I), PTy, "", NewInst);
+ NewCS.setArgument(I, BI);
+ }
+ }
+
+ return insertCallRetCast(Inst, NewInst, DirectCallee);
+}
+
+// Create a PHI to unify the return values of calls.
+static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
+ Function *DirectCallee) {
+ if (Inst->getType()->isVoidTy())
+ return;
+
+ BasicBlock *RetValBB = CallResult->getParent();
+
+ BasicBlock *PHIBB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CallResult))
+ RetValBB = II->getNormalDest();
+
+ PHIBB = RetValBB->getSingleSuccessor();
+ if (getCallRetPHINode(PHIBB, Inst))
+ return;
+
+ PHINode *CallRetPHI = PHINode::Create(Inst->getType(), 0);
+ PHIBB->getInstList().push_front(CallRetPHI);
+ Inst->replaceAllUsesWith(CallRetPHI);
+ CallRetPHI->addIncoming(Inst, Inst->getParent());
+ CallRetPHI->addIncoming(CallResult, RetValBB);
+}
+
+// This function does the actual indirect-call promotion transformation:
+// For an indirect-call like:
+// Ret = (*Foo)(Args);
+// It transforms to:
+// if (Foo == DirectCallee)
+// Ret1 = DirectCallee(Args);
+// else
+// Ret2 = (*Foo)(Args);
+// Ret = phi(Ret1, Ret2);
+// It adds type casts for the args that do not match the parameters and for
+// the return value. Branch weights metadata is also updated.
+// If \p AttachProfToDirectCall is true, a prof metadata is attached to the
+// new direct call to contain \p Count. This is used by SamplePGO inliner to
+// check callsite hotness.
+// Returns the promoted direct call instruction.
+Instruction *llvm::promoteIndirectCall(Instruction *Inst,
+ Function *DirectCallee, uint64_t Count,
+ uint64_t TotalCount,
+ bool AttachProfToDirectCall) {
+ assert(DirectCallee != nullptr);
+ BasicBlock *BB = Inst->getParent();
+ // Just to suppress the non-debug build warning.
+ (void)BB;
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+
+ BasicBlock *DirectCallBB, *IndirectCallBB, *MergeBB;
+ createIfThenElse(Inst, DirectCallee, Count, TotalCount, &DirectCallBB,
+ &IndirectCallBB, &MergeBB);
+
+ Instruction *NewInst =
+ createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
+
+ if (AttachProfToDirectCall) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(Count);
+ MDBuilder MDB(NewInst->getContext());
+ dyn_cast<Instruction>(NewInst->stripPointerCasts())
+ ->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ }
+
+ // Move Inst from MergeBB to IndirectCallBB.
+ Inst->removeFromParent();
+ IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
+ Inst);
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Inst)) {
+ // At this point, the original indirect invoke instruction has the original
+ // UnwindDest and NormalDest. For the direct invoke instruction, the
+ // NormalDest points to MergeBB, and MergeBB jumps to the original
+ // NormalDest. MergeBB might have a new bitcast instruction for the return
+ // value. The PHIs are with the original NormalDest. Since we now have two
+ // incoming edges to NormalDest and UnwindDest, we have to do some fixups.
+ //
+ // UnwindDest will not use the return value. So pass nullptr here.
+ fixupPHINodeForUnwind(Inst, II->getUnwindDest(), MergeBB, IndirectCallBB,
+ DirectCallBB);
+ // We don't need to update the operand from NormalDest for DirectCallBB.
+ // Pass nullptr here.
+ fixupPHINodeForNormalDest(Inst, II->getNormalDest(), MergeBB,
+ IndirectCallBB, NewInst);
+ }
+
+ insertCallRetPHI(Inst, NewInst, DirectCallee);
+
+ DEBUG(dbgs() << "\n== Basic Blocks After ==\n");
+ DEBUG(dbgs() << *BB << *DirectCallBB << *IndirectCallBB << *MergeBB << "\n");
+
+ emitOptimizationRemark(
+ BB->getContext(), "pgo-icall-prom", *BB->getParent(), Inst->getDebugLoc(),
+ Twine("Promote indirect call to ") + DirectCallee->getName() +
+ " with count " + Twine(Count) + " out of " + Twine(TotalCount));
+ return NewInst;
+}
+
+// Promote indirect-call to conditional direct-call for one callsite.
+uint32_t ICallPromotionFunc::tryToPromote(
+ Instruction *Inst, const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount) {
+ uint32_t NumPromoted = 0;
+
+ for (auto &C : Candidates) {
+ uint64_t Count = C.Count;
+ promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, SamplePGO);
+ assert(TotalCount >= Count);
+ TotalCount -= Count;
+ NumOfPGOICallPromotion++;
+ NumPromoted++;
+ }
+ return NumPromoted;
+}
+
+// Traverse all the indirect-call callsites and get the value profile
+// annotations to perform indirect-call promotion.
+bool ICallPromotionFunc::processFunction() {
+ bool Changed = false;
+ ICallPromotionAnalysis ICallAnalysis;
+ for (auto &I : findIndirectCallSites(F)) {
+ uint32_t NumVals, NumCandidates;
+ uint64_t TotalCount;
+ auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
+ I, NumVals, TotalCount, NumCandidates);
+ if (!NumCandidates)
+ continue;
+ auto PromotionCandidates = getPromotionCandidatesForCallSite(
+ I, ICallProfDataRef, TotalCount, NumCandidates);
+ uint32_t NumPromoted = tryToPromote(I, PromotionCandidates, TotalCount);
+ if (NumPromoted == 0)
+ continue;
+
+ Changed = true;
+ // Adjust the MD.prof metadata. First delete the old one.
+ I->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ if (TotalCount == 0 || NumPromoted == NumVals)
+ continue;
+ // Otherwise we need to re-annotate the site with the un-promoted records.
+ annotateValueSite(*M, *I, ICallProfDataRef.slice(NumPromoted), TotalCount,
+ IPVK_IndirectCallTarget, NumCandidates);
+ }
+ return Changed;
+}
+
+// A wrapper function that does the actual work.
+static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {
+ if (DisableICP)
+ return false;
+ InstrProfSymtab Symtab;
+ if (Error E = Symtab.create(M, InLTO)) {
+ std::string SymtabFailure = toString(std::move(E));
+ DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ (void)SymtabFailure;
+ return false;
+ }
+ bool Changed = false;
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ continue;
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO);
+ bool FuncChanged = ICallPromotion.processFunction();
+ if (ICPDUMPAFTER && FuncChanged) {
+ DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ Changed |= FuncChanged;
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ break;
+ }
+ }
+ return Changed;
+}
+
+bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
+ // Command-line option has the priority for InLTO.
+ return promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode);
+}
+
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
new file mode 100644
index 000000000000..db8fa8977947
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -0,0 +1,971 @@
+//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling.
+// It also builds the data structures and initialization code needed for
+// updating execution counts and emitting the profile at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instrprof"
+
+// The start and end values of the precise value profile range for memory
+// intrinsic sizes.
+cl::opt<std::string> MemOPSizeRange(
+ "memop-size-range",
+ cl::desc("Set the range of size in memory intrinsic calls to be profiled "
+ "precisely, in a format of <start_val>:<end_val>"),
+ cl::init(""));
+
+// The value that is considered to be a large value in a memory intrinsic.
+cl::opt<unsigned> MemOPSizeLarge(
+ "memop-size-large",
+ cl::desc("Set large value thresthold in memory intrinsic size profiling. "
+ "Value of 0 disables the large value profiling."),
+ cl::init(8192));
+
+namespace {
+
+cl::opt<bool> DoNameCompression("enable-name-compression",
+ cl::desc("Enable name string compression"),
+ cl::init(true));
+
+cl::opt<bool> DoHashBasedCounterSplit(
+ "hash-based-counter-split",
+ cl::desc("Rename counter variable of a comdat function based on cfg hash"),
+ cl::init(true));
+
+cl::opt<bool> ValueProfileStaticAlloc(
+ "vp-static-alloc",
+ cl::desc("Do static counter allocation for value profiler"),
+ cl::init(true));
+
+cl::opt<double> NumCountersPerValueSite(
+ "vp-counters-per-site",
+ cl::desc("The average number of profile counters allocated "
+ "per value profiling site."),
+ // This is set to a very small value because in real programs, only
+ // a very small percentage of value sites have non-zero targets, e.g., 1/30.
+ // For those sites with non-zero profile, the average number of targets
+ // is usually smaller than 2.
+ cl::init(1.0));
+
+cl::opt<bool> AtomicCounterUpdatePromoted(
+ "atomic-counter-update-promoted", cl::ZeroOrMore,
+ cl::desc("Do counter update using atomic fetch add "
+ " for promoted counters only"),
+ cl::init(false));
+
+// If the option is not specified, the default behavior about whether
+// counter promotion is done depends on how the instrumentation lowering
+// pipeline is set up, i.e., the default value of this option does not
+// mean the promotion will be done by default. Explicitly setting this
+// option can override the default behavior.
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
+ cl::desc("Do counter register promotion"),
+ cl::init(false));
+cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
+ cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
+ cl::desc("Max number counter promotions per loop to avoid"
+ " increasing register pressure too much"));
+
+// A debug option
+cl::opt<int>
+ MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
+ cl::desc("Max number of allowed counter promotions"));
+
+cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
+ cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
+ cl::desc("The max number of exiting blocks of a loop to allow "
+ " speculative counter promotion"));
+
+cl::opt<bool> SpeculativeCounterPromotionToLoop(
+ cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
+ cl::desc("When the option is false, if the target block is in a loop, "
+ "the promotion will be disallowed unless the promoted counter "
+ " update can be further/iteratively promoted into an acyclic "
+ " region."));
+
+cl::opt<bool> IterativeCounterPromotion(
+ cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
+ cl::desc("Allow counter promotion across the whole loop nest."));
+
+class InstrProfilingLegacyPass : public ModulePass {
+ InstrProfiling InstrProf;
+
+public:
+ static char ID;
+
+ InstrProfilingLegacyPass() : ModulePass(ID) {}
+ InstrProfilingLegacyPass(const InstrProfOptions &Options)
+ : ModulePass(ID), InstrProf(Options) {}
+
+ StringRef getPassName() const override {
+ return "Frontend instrumentation-based coverage lowering";
+ }
+
+ bool runOnModule(Module &M) override {
+ return InstrProf.run(M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+///
+/// A helper class to promote one counter RMW operation in the loop
+/// into a register update.
+///
+/// The RMW update for the counter will be sunk out of the loop after
+/// the transformation.
+///
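+/// A rough sketch of the effect (value names are illustrative): a counter
+/// update of the form
+///
+///   %old = load i64, i64* %counter.addr   ; inside the loop body
+///   %new = add i64 %old, 1
+///   store i64 %new, i64* %counter.addr
+///
+/// is rewritten so that the running count is kept in an SSA value inside the
+/// loop, and a single load/add/store (or atomic fetch-add, see
+/// -atomic-counter-update-promoted) of the accumulated amount is emitted in
+/// each exit block.
+///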
+class PGOCounterPromoterHelper : public LoadAndStorePromoter {
+public:
+ PGOCounterPromoterHelper(
+ Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
+ BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<Instruction *> InsertPts,
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ LoopInfo &LI)
+ : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
+ InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
+ assert(isa<LoadInst>(L));
+ assert(isa<StoreInst>(S));
+ SSA.AddAvailableValue(PH, Init);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() const override {
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ Instruction *InsertPos = InsertPts[i];
+ // Get LiveIn value into the ExitBlock. If there are multiple
+ // predecessors, the value is defined by a PHI node in this
+ // block.
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
+ IRBuilder<> Builder(InsertPos);
+ if (AtomicCounterUpdatePromoted)
+ // Atomic updates can currently only be promoted across the current
+ // loop, not the whole loop nest.
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
+ AtomicOrdering::SequentiallyConsistent);
+ else {
+ LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted");
+ auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
+ auto *NewStore = Builder.CreateStore(NewVal, Addr);
+
+ // Now update the parent loop's candidate list:
+ if (IterativeCounterPromotion) {
+ auto *TargetLoop = LI.getLoopFor(ExitBlock);
+ if (TargetLoop)
+ LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
+ }
+ }
+ }
+ }
+
+private:
+ Instruction *Store;
+ ArrayRef<BasicBlock *> ExitBlocks;
+ ArrayRef<Instruction *> InsertPts;
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ LoopInfo &LI;
+};
+
+/// A helper class to do register promotion for all profile counter
+/// updates in a loop.
+///
+class PGOCounterPromoter {
+public:
+ PGOCounterPromoter(
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ Loop &CurLoop, LoopInfo &LI)
+ : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
+ LI(LI) {
+
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ SmallPtrSet<BasicBlock *, 8> BlockSet;
+ L.getExitBlocks(LoopExitBlocks);
+
+ for (BasicBlock *ExitBlock : LoopExitBlocks) {
+ if (BlockSet.insert(ExitBlock).second) {
+ ExitBlocks.push_back(ExitBlock);
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+ }
+ }
+
+ bool run(int64_t *NumPromoted) {
+ unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
+ if (MaxProm == 0)
+ return false;
+
+ unsigned Promoted = 0;
+ for (auto &Cand : LoopToCandidates[&L]) {
+
+ SmallVector<PHINode *, 4> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
+
+ PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
+ L.getLoopPreheader(), ExitBlocks,
+ InsertPts, LoopToCandidates, LI);
+ Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
+ Promoted++;
+ if (Promoted >= MaxProm)
+ break;
+
+ (*NumPromoted)++;
+ if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
+ break;
+ }
+
+ DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
+ return Promoted != 0;
+ }
+
+private:
+ bool allowSpeculativeCounterPromotion(Loop *LP) {
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ L.getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return true;
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return false;
+ return true;
+ }
+
+ // Returns the max number of Counter Promotions for LP.
+ unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ // We can't insert into a catchswitch.
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ LP->getExitBlocks(LoopExitBlocks);
+ if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ }))
+ return 0;
+
+ if (!LP->hasDedicatedExits())
+ return 0;
+
+ BasicBlock *PH = LP->getLoopPreheader();
+ if (!PH)
+ return 0;
+
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ LP->getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return MaxNumOfPromotionsPerLoop;
+
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return 0;
+
+ // Whether the target block is in a loop does not matter:
+ if (SpeculativeCounterPromotionToLoop)
+ return MaxNumOfPromotionsPerLoop;
+
+ // Now check the target block:
+ unsigned MaxProm = MaxNumOfPromotionsPerLoop;
+ for (auto *TargetBlock : LoopExitBlocks) {
+ auto *TargetLoop = LI.getLoopFor(TargetBlock);
+ if (!TargetLoop)
+ continue;
+ unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
+ unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
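+ // Cap MaxProm by the remaining promotion budget of the target loop
+ // (MaxPromForTarget - PendingCandsInTarget); the std::max below clamps
+ // the difference at zero so the unsigned subtraction cannot wrap.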
+ MaxProm =
+ std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
+ PendingCandsInTarget);
+ }
+ return MaxProm;
+ }
+
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ SmallVector<Instruction *, 8> InsertPts;
+ Loop &L;
+ LoopInfo &LI;
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!run(M, TLI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char InstrProfilingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+
+ModulePass *
+llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options) {
+ return new InstrProfilingLegacyPass(Options);
+}
+
+static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
+ InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
+ if (Inc)
+ return Inc;
+ return dyn_cast<InstrProfIncrementInst>(Instr);
+}
+
+bool InstrProfiling::lowerIntrinsics(Function *F) {
+ bool MadeChange = false;
+ PromotionCandidates.clear();
+ for (BasicBlock &BB : *F) {
+ for (auto I = BB.begin(), E = BB.end(); I != E;) {
+ auto Instr = I++;
+ InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ if (Inc) {
+ lowerIncrement(Inc);
+ MadeChange = true;
+ } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+ lowerValueProfileInst(Ind);
+ MadeChange = true;
+ }
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ promoteCounterLoadStores(F);
+ return true;
+}
+
+bool InstrProfiling::isCounterPromotionEnabled() const {
+ if (DoCounterPromotion.getNumOccurrences() > 0)
+ return DoCounterPromotion;
+
+ return Options.DoCounterPromotion;
+}
+
+void InstrProfiling::promoteCounterLoadStores(Function *F) {
+ if (!isCounterPromotionEnabled())
+ return;
+
+ DominatorTree DT(*F);
+ LoopInfo LI(DT);
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
+
+ for (const auto &LoadStore : PromotionCandidates) {
+ auto *CounterLoad = LoadStore.first;
+ auto *CounterStore = LoadStore.second;
+ BasicBlock *BB = CounterLoad->getParent();
+ Loop *ParentLoop = LI.getLoopFor(BB);
+ if (!ParentLoop)
+ continue;
+ LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
+ }
+
+ SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
+
+ // Do a post-order traversal of the loops so that counter updates can be
+ // iteratively hoisted outside the loop nest.
+ for (auto *Loop : llvm::reverse(Loops)) {
+ PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI);
+ Promoter.run(&TotalCountersPromoted);
+ }
+}
+
+bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
+ bool MadeChange = false;
+
+ this->M = &M;
+ this->TLI = &TLI;
+ NamesVar = nullptr;
+ NamesSize = 0;
+ ProfileDataMap.clear();
+ UsedVars.clear();
+ getMemOPSizeRangeFromOption(MemOPSizeRange, MemOPSizeRangeStart,
+ MemOPSizeRangeLast);
+ TT = Triple(M.getTargetTriple());
+
+ // We do not know in advance how many value sites there are inside
+ // the instrumented function. Count the number of instrumented target
+ // value sites here so it can be entered as a field in the profile data
+ // variable.
+ for (Function &F : M) {
+ InstrProfIncrementInst *FirstProfIncInst = nullptr;
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(), E = BB.end(); I != E; I++)
+ if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
+ computeNumValueSiteCounts(Ind);
+ else if (FirstProfIncInst == nullptr)
+ FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
+
+ // Value profiling intrinsic lowering requires per-function profile data
+ // variable to be created first.
+ if (FirstProfIncInst != nullptr)
+ static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
+ }
+
+ for (Function &F : M)
+ MadeChange |= lowerIntrinsics(&F);
+
+ if (GlobalVariable *CoverageNamesVar =
+ M.getNamedGlobal(getCoverageUnusedNamesVarName())) {
+ lowerCoverageData(CoverageNamesVar);
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return false;
+
+ emitVNodes();
+ emitNameData();
+ emitRegistration();
+ emitRuntimeHook();
+ emitUses();
+ emitInitialization();
+ return true;
+}
+
+static Constant *getOrInsertValueProfilingCall(Module &M,
+ const TargetLibraryInfo &TLI,
+ bool IsRange = false) {
+ LLVMContext &Ctx = M.getContext();
+ auto *ReturnTy = Type::getVoidTy(M.getContext());
+
+ Constant *Res;
+ if (!IsRange) {
+ Type *ParamTypes[] = {
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *ValueProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+ ValueProfilingCallTy);
+ } else {
+ Type *RangeParamTypes[] = {
+#define VALUE_RANGE_PROF 1
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+#undef VALUE_RANGE_PROF
+ };
+ auto *ValueRangeProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(RangeParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueRangeProfFuncName(),
+ ValueRangeProfilingCallTy);
+ }
+
+ if (Function *FunRes = dyn_cast<Function>(Res)) {
+ if (auto AK = TLI.getExtAttrForI32Param(false))
+ FunRes->addParamAttr(2, AK);
+ }
+ return Res;
+}
+
+void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ auto It = ProfileDataMap.find(Name);
+ if (It == ProfileDataMap.end()) {
+ PerFunctionProfileData PD;
+ PD.NumValueSites[ValueKind] = Index + 1;
+ ProfileDataMap[Name] = PD;
+ } else if (It->second.NumValueSites[ValueKind] <= Index)
+ It->second.NumValueSites[ValueKind] = Index + 1;
+}
+
+void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ auto It = ProfileDataMap.find(Name);
+ assert(It != ProfileDataMap.end() && It->second.DataVar &&
+ "value profiling detected in function with no counter incerement");
+
+ GlobalVariable *DataVar = It->second.DataVar;
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
+ Index += It->second.NumValueSites[Kind];
+
+ IRBuilder<> Builder(Ind);
+ bool IsRange = (Ind->getValueKind()->getZExtValue() ==
+ llvm::InstrProfValueKind::IPVK_MemOPSize);
+ CallInst *Call = nullptr;
+ if (!IsRange) {
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args);
+ } else {
+ Value *Args[6] = {
+ Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index),
+ Builder.getInt64(MemOPSizeRangeStart),
+ Builder.getInt64(MemOPSizeRangeLast),
+ Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)};
+ Call =
+ Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args);
+ }
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ Call->addParamAttr(2, AK);
+ Ind->replaceAllUsesWith(Call);
+ Ind->eraseFromParent();
+}
+
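+// Lower an llvm.instrprof.increment(.step) intrinsic into a plain
+// load/add/store on the corresponding slot of the per-function counter array.
+// A sketch of the emitted IR (value names and the counter variable name are
+// illustrative):
+//
+//   %addr = getelementptr inbounds [N x i64], [N x i64]* @__profc_foo, i64 0, i64 <Index>
+//   %cnt  = load i64, i64* %addr
+//   %add  = add i64 %cnt, <step>
+//   store i64 %add, i64* %addr
+//
+// The load/store pair is also remembered as a candidate for later counter
+// promotion when promotion is enabled.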
+void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
+ GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+
+ IRBuilder<> Builder(Inc);
+ uint64_t Index = Inc->getIndex()->getZExtValue();
+ Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index);
+ Value *Load = Builder.CreateLoad(Addr, "pgocount");
+ auto *Count = Builder.CreateAdd(Load, Inc->getStep());
+ auto *Store = Builder.CreateStore(Count, Addr);
+ Inc->replaceAllUsesWith(Store);
+ if (isCounterPromotionEnabled())
+ PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
+ Inc->eraseFromParent();
+}
+
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
+ ConstantArray *Names =
+ cast<ConstantArray>(CoverageNamesVar->getInitializer());
+ for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
+ Constant *NC = Names->getOperand(I);
+ Value *V = NC->stripPointerCasts();
+ assert(isa<GlobalVariable>(V) && "Missing reference to function name");
+ GlobalVariable *Name = cast<GlobalVariable>(V);
+
+ Name->setLinkage(GlobalValue::PrivateLinkage);
+ ReferencedNames.push_back(Name);
+ NC->dropAllReferences();
+ }
+ CoverageNamesVar->eraseFromParent();
+}
+
+/// Get the name of a profiling variable for a particular function.
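+/// When hash-based counter splitting applies to a comdat function, the CFG
+/// hash is appended to the name, so a counter variable may end up named, for
+/// example, "__profc_foo.<hash>" (the function name "foo" is illustrative).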
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+ StringRef NamePrefix = getInstrProfNameVarPrefix();
+ StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
+ Function *F = Inc->getParent()->getParent();
+ Module *M = F->getParent();
+ if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
+ !canRenameComdatFunc(*F))
+ return (Prefix + Name).str();
+ uint64_t FuncHash = Inc->getHash()->getZExtValue();
+ SmallVector<char, 24> HashPostfix;
+ if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
+ return (Prefix + Name).str();
+ return (Prefix + Name + "." + Twine(FuncHash)).str();
+}
+
+static inline bool shouldRecordFunctionAddr(Function *F) {
+ // Check the linkage
+ bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
+ if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
+ !HasAvailableExternallyLinkage)
+ return true;
+
+ // A function marked 'alwaysinline' with available_externally linkage can't
+ // have its address taken. Doing so would create an undefined external ref to
+ // the function, which would fail to link.
+ if (HasAvailableExternallyLinkage &&
+ F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
+ // Prohibit function address recording if the function is both internal and
+ // COMDAT. This avoids the profile data variable referencing internal symbols
+ // in COMDAT.
+ if (F->hasLocalLinkage() && F->hasComdat())
+ return false;
+
+ // Check uses of this function for anything other than direct calls or
+ // invokes to it. Inline virtual functions have linkOnceODR linkage. When a
+ // key method exists, the vtable will only be emitted in the TU where the key
+ // method is defined. In a TU where the vtable is not available, the function
+ // won't be 'addresstaken'. If its address is not recorded here, the profile
+ // data with the missing address may be picked by the linker, leading to
+ // missing indirect call target info.
+ return F->hasAddressTaken() || F->hasLinkOnceLinkage();
+}
+
+static inline Comdat *getOrCreateProfileComdat(Module &M, Function &F,
+ InstrProfIncrementInst *Inc) {
+ if (!needsComdatForCounter(F, M))
+ return nullptr;
+
+ // The COFF format requires a COMDAT section to have a key symbol with the
+ // same name. The linker targeting COFF also requires that the COMDAT section
+ // a section is associated with must precede the associating section. For
+ // this reason, we must choose the counter variable's name as the name of the
+ // comdat.
+ StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF()
+ ? getInstrProfCountersVarPrefix()
+ : getInstrProfComdatPrefix());
+ return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix)));
+}
+
+static bool needsRuntimeRegistrationOfSectionRange(const Module &M) {
+ // Don't do this for Darwin. compiler-rt uses linker magic.
+ if (Triple(M.getTargetTriple()).isOSDarwin())
+ return false;
+
+ // Use linker script magic to get data/cnts/name start/end.
+ if (Triple(M.getTargetTriple()).isOSLinux() ||
+ Triple(M.getTargetTriple()).isOSFreeBSD() ||
+ Triple(M.getTargetTriple()).isPS4CPU())
+ return false;
+
+ return true;
+}
+
+GlobalVariable *
+InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+ GlobalVariable *NamePtr = Inc->getName();
+ auto It = ProfileDataMap.find(NamePtr);
+ PerFunctionProfileData PD;
+ if (It != ProfileDataMap.end()) {
+ if (It->second.RegionCounters)
+ return It->second.RegionCounters;
+ PD = It->second;
+ }
+
+ // Move the name variable to the right section. Place the profile variables
+ // in a COMDAT group if the associated function is a COMDAT. This makes sure
+ // that only one copy of the counters of the COMDAT function will be emitted
+ // after linking.
+ Function *Fn = Inc->getParent()->getParent();
+ Comdat *ProfileVarsComdat = getOrCreateProfileComdat(*M, *Fn, Inc);
+
+ uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+ LLVMContext &Ctx = M->getContext();
+ ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+ // Create the counters variable.
+ auto *CounterPtr =
+ new GlobalVariable(*M, CounterTy, false, NamePtr->getLinkage(),
+ Constant::getNullValue(CounterTy),
+ getVarName(Inc, getInstrProfCountersVarPrefix()));
+ CounterPtr->setVisibility(NamePtr->getVisibility());
+ CounterPtr->setSection(
+ getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
+ CounterPtr->setAlignment(8);
+ CounterPtr->setComdat(ProfileVarsComdat);
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ // Statically allocate the array of pointers to value profile nodes for
+ // the current function.
+ Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
+ if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(*M)) {
+ uint64_t NS = 0;
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ NS += PD.NumValueSites[Kind];
+ if (NS) {
+ ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
+
+ auto *ValuesVar =
+ new GlobalVariable(*M, ValuesTy, false, NamePtr->getLinkage(),
+ Constant::getNullValue(ValuesTy),
+ getVarName(Inc, getInstrProfValuesVarPrefix()));
+ ValuesVar->setVisibility(NamePtr->getVisibility());
+ ValuesVar->setSection(
+ getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
+ ValuesVar->setAlignment(8);
+ ValuesVar->setComdat(ProfileVarsComdat);
+ ValuesPtrExpr =
+ ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
+ }
+ }
+
+ // Create data variable.
+ auto *Int16Ty = Type::getInt16Ty(Ctx);
+ auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+ Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
+
+ Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
+ ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
+ : ConstantPointerNull::get(Int8PtrTy);
+
+ Constant *Int16ArrayVals[IPVK_Last + 1];
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
+
+ Constant *DataVals[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(),
+ ConstantStruct::get(DataTy, DataVals),
+ getVarName(Inc, getInstrProfDataVarPrefix()));
+ Data->setVisibility(NamePtr->getVisibility());
+ Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
+ Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT);
+ Data->setComdat(ProfileVarsComdat);
+
+ PD.RegionCounters = CounterPtr;
+ PD.DataVar = Data;
+ ProfileDataMap[NamePtr] = PD;
+
+ // Mark the data variable as used so that it isn't stripped out.
+ UsedVars.push_back(Data);
+ // Now that the linkage set by the FE has been passed to the data and counter
+ // variables, reset the Name variable's linkage and visibility to private so
+ // that it can be removed later by the compiler.
+ NamePtr->setLinkage(GlobalValue::PrivateLinkage);
+ // Collect the referenced names to be used by emitNameData.
+ ReferencedNames.push_back(NamePtr);
+
+ return CounterPtr;
+}
+
+void InstrProfiling::emitVNodes() {
+ if (!ValueProfileStaticAlloc)
+ return;
+
+ // For now only support this on platforms that do
+ // not require runtime registration to discover
+ // named section start/end.
+ if (needsRuntimeRegistrationOfSectionRange(*M))
+ return;
+
+ size_t TotalNS = 0;
+ for (auto &PD : ProfileDataMap) {
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ TotalNS += PD.second.NumValueSites[Kind];
+ }
+
+ if (!TotalNS)
+ return;
+
+ uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
+// Heuristic for small programs with very few total value sites.
+// The default value of vp-counters-per-site is chosen based on
+// the observation that large apps usually have a low percentage
+// of value sites that actually have any profile data, and thus
+// the average number of counters per site is low. For small
+// apps with very few sites, this may not be true. Bump up the
+// number of counters in this case.
+#define INSTR_PROF_MIN_VAL_COUNTS 10
+ if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
+ NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
+
+ auto &Ctx = M->getContext();
+ Type *VNodeTypes[] = {
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
+
+ ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
+ auto *VNodesVar = new GlobalVariable(
+ *M, VNodesTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
+ VNodesVar->setSection(
+ getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
+ UsedVars.push_back(VNodesVar);
+}
+
+void InstrProfiling::emitNameData() {
+ std::string UncompressedData;
+
+ if (ReferencedNames.empty())
+ return;
+
+ std::string CompressedNameStr;
+ if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
+ DoNameCompression)) {
+ report_fatal_error(toString(std::move(E)), false);
+ }
+
+ auto &Ctx = M->getContext();
+ auto *NamesVal = ConstantDataArray::getString(
+ Ctx, StringRef(CompressedNameStr), false);
+ NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
+ GlobalValue::PrivateLinkage, NamesVal,
+ getInstrProfNamesVarName());
+ NamesSize = CompressedNameStr.size();
+ NamesVar->setSection(
+ getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+ UsedVars.push_back(NamesVar);
+
+ for (auto *NamePtr : ReferencedNames)
+ NamePtr->eraseFromParent();
+}
+
+void InstrProfiling::emitRegistration() {
+ if (!needsRuntimeRegistrationOfSectionRange(*M))
+ return;
+
+ // Construct the function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+ auto *Int64Ty = Type::getInt64Ty(M->getContext());
+ auto *RegisterFTy = FunctionType::get(VoidTy, false);
+ auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
+ getInstrProfRegFuncsName(), M);
+ RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ if (Options.NoRedZone)
+ RegisterF->addFnAttr(Attribute::NoRedZone);
+
+ auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
+ auto *RuntimeRegisterF =
+ Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfRegFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
+ for (Value *Data : UsedVars)
+ if (Data != NamesVar)
+ IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+
+ if (NamesVar) {
+ Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
+ auto *NamesRegisterTy =
+ FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
+ auto *NamesRegisterF =
+ Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfNamesRegFuncName(), M);
+ IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
+ IRB.getInt64(NamesSize)});
+ }
+
+ IRB.CreateRetVoid();
+}
+
+void InstrProfiling::emitRuntimeHook() {
+ // We expect the linker to be invoked with the -u<hook_var> flag on Linux,
+ // in which case there is no need to emit the user function.
+ if (Triple(M->getTargetTriple()).isOSLinux())
+ return;
+
+ // If the module has provided its own runtime, we don't need to do anything.
+ if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
+ return;
+
+ // Declare an external variable that will pull in the runtime initialization.
+ auto *Int32Ty = Type::getInt32Ty(M->getContext());
+ auto *Var =
+ new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, getInstrProfRuntimeHookVarName());
+
+ // Make a function that uses it.
+ auto *User = Function::Create(FunctionType::get(Int32Ty, false),
+ GlobalValue::LinkOnceODRLinkage,
+ getInstrProfRuntimeHookVarUseFuncName(), M);
+ User->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ User->addFnAttr(Attribute::NoRedZone);
+ User->setVisibility(GlobalValue::HiddenVisibility);
+ if (Triple(M->getTargetTriple()).supportsCOMDAT())
+ User->setComdat(M->getOrInsertComdat(User->getName()));
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+ auto *Load = IRB.CreateLoad(Var);
+ IRB.CreateRet(Load);
+
+ // Mark the user variable as used so that it isn't stripped out.
+ UsedVars.push_back(User);
+}
+
+void InstrProfiling::emitUses() {
+ if (!UsedVars.empty())
+ appendToUsed(*M, UsedVars);
+}
+
+void InstrProfiling::emitInitialization() {
+ StringRef InstrProfileOutput = Options.InstrProfileOutput;
+
+ if (!InstrProfileOutput.empty()) {
+ // Create variable for profile name.
+ Constant *ProfileNameConst =
+ ConstantDataArray::getString(M->getContext(), InstrProfileOutput, true);
+ GlobalVariable *ProfileNameVar = new GlobalVariable(
+ *M, ProfileNameConst->getType(), true, GlobalValue::WeakAnyLinkage,
+ ProfileNameConst, INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR));
+ if (TT.supportsCOMDAT()) {
+ ProfileNameVar->setLinkage(GlobalValue::ExternalLinkage);
+ ProfileNameVar->setComdat(M->getOrInsertComdat(
+ StringRef(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR))));
+ }
+ }
+
+ Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName());
+ if (!RegisterF)
+ return;
+
+ // Create the initialization function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage,
+ getInstrProfInitFuncName(), M);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ F->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
+
+ // Add the basic block and the necessary calls.
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
+ if (RegisterF)
+ IRB.CreateCall(RegisterF, {});
+ IRB.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
new file mode 100644
index 000000000000..7bb62d2c8455
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -0,0 +1,79 @@
+//===-- Instrumentation.cpp - Instrumentation Infrastructure --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// Instrumentation library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// Moves I before IP. Returns new insert point.
+static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) {
+ // If I is IP, move the insert point down.
+ if (I == IP)
+ return ++IP;
+ // Otherwise, move I before IP and return IP.
+ I->moveBefore(&*IP);
+ return IP;
+}
+
+/// Instrumentation passes often insert conditional checks into entry blocks.
+/// Call this function before splitting the entry block to move instructions
+/// that must remain in the entry block up before the split point. Static
+/// allocas and llvm.localescape calls, for example, must remain in the entry
+/// block.
+BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
+ BasicBlock::iterator IP) {
+ assert(&BB.getParent()->getEntryBlock() == &BB);
+ for (auto I = IP, E = BB.end(); I != E; ++I) {
+ bool KeepInEntry = false;
+ if (auto *AI = dyn_cast<AllocaInst>(I)) {
+ if (AI->isStaticAlloca())
+ KeepInEntry = true;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
+ KeepInEntry = true;
+ }
+ if (KeepInEntry)
+ IP = moveBeforeInsertPoint(I, IP);
+ }
+ return IP;
+}
+
+/// initializeInstrumentation - Initialize all passes in the Instrumentation
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+ initializeAddressSanitizerPass(Registry);
+ initializeAddressSanitizerModulePass(Registry);
+ initializeBoundsCheckingPass(Registry);
+ initializeGCOVProfilerLegacyPassPass(Registry);
+ initializePGOInstrumentationGenLegacyPassPass(Registry);
+ initializePGOInstrumentationUseLegacyPassPass(Registry);
+ initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializePGOMemOPSizeOptLegacyPassPass(Registry);
+ initializeInstrProfilingLegacyPassPass(Registry);
+ initializeMemorySanitizerPass(Registry);
+ initializeThreadSanitizerPass(Registry);
+ initializeSanitizerCoverageModulePass(Registry);
+ initializeDataFlowSanitizerPass(Registry);
+ initializeEfficiencySanitizerPass(Registry);
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+ initializeInstrumentation(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
new file mode 100644
index 000000000000..4eb758c69c58
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -0,0 +1,111 @@
+//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This module provides means for calculating a maximum spanning tree for a
+// given set of weighted edges. The type parameter T is the type of a node.
+//
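+// A minimal usage sketch (node values and weights are illustrative):
+//
+//   MaximumSpanningTree<BasicBlock>::EdgeWeights Edges;
+//   Edges.push_back({{&BB0, &BB1}, 10.0});
+//   Edges.push_back({{&BB1, &BB2}, 5.0});
+//   MaximumSpanningTree<BasicBlock> MST(Edges);
+//   for (const auto &E : MST)
+//     ; // E is an Edge, i.e. std::pair<const BasicBlock *, const BasicBlock *>.
+//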
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
+
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/IR/BasicBlock.h"
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+
+ /// MaximumSpanningTree - An MST implementation.
+ /// The type parameter T determines the type of the nodes of the graph.
+ template <typename T>
+ class MaximumSpanningTree {
+ public:
+ typedef std::pair<const T*, const T*> Edge;
+ typedef std::pair<Edge, double> EdgeWeight;
+ typedef std::vector<EdgeWeight> EdgeWeights;
+ protected:
+ typedef std::vector<Edge> MaxSpanTree;
+
+ MaxSpanTree MST;
+
+ private:
+ // A comparator for weighted edges.
+ struct EdgeWeightCompare {
+ static size_t getBlockSize(const T *X) {
+ const BasicBlock *BB = dyn_cast_or_null<BasicBlock>(X);
+ return BB ? BB->size() : 0;
+ }
+
+ bool operator()(EdgeWeight X, EdgeWeight Y) const {
+ if (X.second > Y.second) return true;
+ if (X.second < Y.second) return false;
+
+ // Equal edge weights: break ties by comparing block sizes.
+ size_t XSizeA = getBlockSize(X.first.first);
+ size_t YSizeA = getBlockSize(Y.first.first);
+ if (XSizeA > YSizeA) return true;
+ if (XSizeA < YSizeA) return false;
+
+ size_t XSizeB = getBlockSize(X.first.second);
+ size_t YSizeB = getBlockSize(Y.first.second);
+ if (XSizeB > YSizeB) return true;
+ if (XSizeB < YSizeB) return false;
+
+ return false;
+ }
+ };
+
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+
+ /// MaximumSpanningTree() - Takes a vector of weighted edges and computes a
+ /// maximum spanning tree from them.
+ MaximumSpanningTree(EdgeWeights &EdgeVector) {
+
+ std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare());
+
+ // Create the spanning tree. Forest is a union-find data structure that
+ // makes checking whether two nodes are already in a common (sub-)tree
+ // fast and cheap.
+ EquivalenceClasses<const T*> Forest;
+ for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+ EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+ Edge e = (*EWi).first;
+
+ Forest.insert(e.first);
+ Forest.insert(e.second);
+ }
+
+ // Iterate over the sorted edges, biggest first.
+ for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+ EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+ Edge e = (*EWi).first;
+
+ if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) {
+ Forest.unionSets(e.first, e.second);
+ // The edge connects two previously disjoint subtrees, so it cannot
+ // create a cycle; push the edge to the MST.
+ MST.push_back(e);
+ }
+ }
+ }
+
+ typename MaxSpanTree::iterator begin() {
+ return MST.begin();
+ }
+
+ typename MaxSpanTree::iterator end() {
+ return MST.end();
+ }
+ };
+
+} // End llvm namespace
+
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
new file mode 100644
index 000000000000..1348e0ed0ed0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -0,0 +1,3669 @@
+//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needlessly overwriting the origin of the 4-byte region
+/// on a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, an atomic
+/// store to two disjoint locations cannot be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. Current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msan"
+
+static const unsigned kOriginSize = 4;
+static const unsigned kMinOriginAlignment = 4;
+static const unsigned kShadowTLSAlignment = 8;
+
+// These constants must be kept in sync with the ones in msan.h.
+static const unsigned kParamTLSSize = 800;
+static const unsigned kRetvalTLSSize = 800;
+
+// Access sizes are powers of two: 1, 2, 4, 8.
+static const size_t kNumberOfAccessSizes = 4;
+
+/// \brief Track origins of uninitialized values.
+///
+/// Adds a section to MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<int> ClTrackOrigins("msan-track-origins",
+ cl::desc("Track origins (allocation sites) of poisoned memory"),
+ cl::Hidden, cl::init(0));
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+ cl::desc("keep going after reporting a UMR"),
+ cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+ cl::desc("poison uninitialized stack variables"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+ cl::desc("poison uninitialized stack variables with a call"),
+ cl::Hidden, cl::init(false));
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+ cl::desc("poison uninitialized stack variables with the given pattern"),
+ cl::Hidden, cl::init(0xff));
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+ cl::desc("poison undef temps"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+ cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+ cl::desc("exact handling of relational integer ICmp"),
+ cl::Hidden, cl::init(false));
+
+// This flag controls whether we check the shadow of the address
+// operand of a load or store. Such bugs are very rare, since a load from
+// a garbage address typically results in SEGV, but they still happen
+// (e.g. when only the lower bits of the address are garbage, or when the
+// access happens early at program startup where malloc-ed memory is more
+// likely to be zeroed). As of 2012-08-28 this flag adds a 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+ cl::desc("report accesses through a pointer which has poisoned shadow"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+ cl::desc("print out instructions with default strict semantics"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClInstrumentationWithCallThreshold(
+ "msan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented requires more than "
+ "this number of checks and origin stores, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(3500));
+
+// This is an experiment to enable handling of cases where the shadow is a
+// non-zero compile-time constant. For some unexplained reason such cases were
+// silently ignored in the instrumentation.
+static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
+ cl::desc("Insert checks for constant shadow values"),
+ cl::Hidden, cl::init(false));
+
+// This is off by default because of a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool> ClWithComdat("msan-with-comdat",
+ cl::desc("Place MSan constructors in comdat sections"),
+ cl::Hidden, cl::init(false));
+
+static const char *const kMsanModuleCtorName = "msan.module_ctor";
+static const char *const kMsanInitName = "__msan_init";
+
+namespace {
+
+// Memory map parameters used in application-to-shadow address calculation.
+// Offset = (Addr & ~AndMask) ^ XorMask
+// Shadow = ShadowBase + Offset
+// Origin = OriginBase + Offset
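+//
+// For example, with the (non-legacy) x86_64 Linux parameters defined below
+// (AndMask = 0, XorMask = 0x500000000000, ShadowBase = 0,
+// OriginBase = 0x100000000000), this reduces to:
+//   Shadow(Addr) = Addr ^ 0x500000000000
+//   Origin(Addr) = 0x100000000000 + (Addr ^ 0x500000000000)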
+struct MemoryMapParams {
+ uint64_t AndMask;
+ uint64_t XorMask;
+ uint64_t ShadowBase;
+ uint64_t OriginBase;
+};
+
+struct PlatformMemoryMapParams {
+ const MemoryMapParams *bits32;
+ const MemoryMapParams *bits64;
+};
+
+// i386 Linux
+static const MemoryMapParams Linux_I386_MemoryMapParams = {
+ 0x000080000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x000040000000, // OriginBase
+};
+
+// x86_64 Linux
+static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
+#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
+ 0x400000000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x200000000000, // OriginBase
+#else
+ 0, // AndMask (not used)
+ 0x500000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x100000000000, // OriginBase
+#endif
+};
+
+// mips64 Linux
+static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x008000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x002000000000, // OriginBase
+};
+
+// ppc64 Linux
+static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
+ 0x200000000000, // AndMask
+ 0x100000000000, // XorMask
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
+// aarch64 Linux
+static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x06000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x01000000000, // OriginBase
+};
+
+// i386 FreeBSD
+static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
+ 0x000180000000, // AndMask
+ 0x000040000000, // XorMask
+ 0x000020000000, // ShadowBase
+ 0x000700000000, // OriginBase
+};
+
+// x86_64 FreeBSD
+static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
+ 0xc00000000000, // AndMask
+ 0x200000000000, // XorMask
+ 0x100000000000, // ShadowBase
+ 0x380000000000, // OriginBase
+};
+
+static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
+ &Linux_I386_MemoryMapParams,
+ &Linux_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
+ nullptr,
+ &Linux_MIPS64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
+ nullptr,
+ &Linux_PowerPC64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
+ nullptr,
+ &Linux_AArch64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
+ &FreeBSD_I386_MemoryMapParams,
+ &FreeBSD_X86_64_MemoryMapParams,
+};
+
+/// \brief An instrumentation pass implementing detection of uninitialized
+/// reads.
+///
+/// MemorySanitizer: instrument the code in a module to find
+/// uninitialized reads.
+class MemorySanitizer : public FunctionPass {
+ public:
+ MemorySanitizer(int TrackOrigins = 0, bool Recover = false)
+ : FunctionPass(ID),
+ TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
+ Recover(Recover || ClKeepGoing),
+ WarningFn(nullptr) {}
+ StringRef getPassName() const override { return "MemorySanitizer"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override;
+ bool doInitialization(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid.
+
+ private:
+ void initializeCallbacks(Module &M);
+
+ /// \brief Track origins (allocation points) of uninitialized values.
+ int TrackOrigins;
+ bool Recover;
+
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *OriginTy;
+ /// \brief Thread-local shadow storage for function parameters.
+ GlobalVariable *ParamTLS;
+ /// \brief Thread-local origin storage for function parameters.
+ GlobalVariable *ParamOriginTLS;
+ /// \brief Thread-local shadow storage for function return value.
+ GlobalVariable *RetvalTLS;
+ /// \brief Thread-local origin storage for function return value.
+ GlobalVariable *RetvalOriginTLS;
+ /// \brief Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ GlobalVariable *VAArgTLS;
+ /// \brief Thread-local shadow storage for va_arg overflow area
+ /// (x86_64-specific).
+ GlobalVariable *VAArgOverflowSizeTLS;
+ /// \brief Thread-local space used to pass origin value to the UMR reporting
+ /// function.
+ GlobalVariable *OriginTLS;
+
+ /// \brief The run-time callback to print a warning.
+ Value *WarningFn;
+ // These arrays are indexed by log2(AccessSize).
+ Value *MaybeWarningFn[kNumberOfAccessSizes];
+ Value *MaybeStoreOriginFn[kNumberOfAccessSizes];
+
+ /// \brief Run-time helper that generates a new origin value for a stack
+ /// allocation.
+ Value *MsanSetAllocaOrigin4Fn;
+ /// \brief Run-time helper that poisons stack on function entry.
+ Value *MsanPoisonStackFn;
+ /// \brief Run-time helper that records a store (or any event) of an
+ /// uninitialized value and returns an updated origin id encoding this info.
+ Value *MsanChainOriginFn;
+ /// \brief MSan runtime replacements for memmove, memcpy and memset.
+ Value *MemmoveFn, *MemcpyFn, *MemsetFn;
+
+ /// \brief Memory map parameters used in application-to-shadow calculation.
+ const MemoryMapParams *MapParams;
+
+ MDNode *ColdCallWeights;
+ /// \brief Branch weights for origin store.
+ MDNode *OriginStoreWeights;
+ /// \brief An empty volatile inline asm that prevents callback merge.
+ InlineAsm *EmptyAsm;
+ Function *MsanCtorFunction;
+
+ friend struct MemorySanitizerVisitor;
+ friend struct VarArgAMD64Helper;
+ friend struct VarArgMIPS64Helper;
+ friend struct VarArgAArch64Helper;
+ friend struct VarArgPowerPC64Helper;
+};
+} // anonymous namespace
+
+char MemorySanitizer::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ MemorySanitizer, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ MemorySanitizer, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false, false)
+
+FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins, bool Recover) {
+ return new MemorySanitizer(TrackOrigins, Recover);
+}
+
+/// \brief Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. The runtime uses the first 4 bytes of the string to store
+/// the frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+ StringRef Str) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+ GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+/// \brief Insert extern declarations of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (WarningFn)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Create the callback.
+ // FIXME: this function should have "Cold" calling conv,
+ // which is not yet implemented.
+ StringRef WarningFnName = Recover ? "__msan_warning"
+ : "__msan_warning_noreturn";
+ WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ unsigned AccessSize = 1 << AccessSizeIndex;
+ std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
+ MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
+ IRB.getInt32Ty());
+
+ FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
+ MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
+ IRB.getInt8PtrTy(), IRB.getInt32Ty());
+ }
+
+ MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+ "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+ IRB.getInt8PtrTy(), IntptrTy);
+ MsanPoisonStackFn =
+ M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ MsanChainOriginFn = M.getOrInsertFunction(
+ "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
+ MemmoveFn = M.getOrInsertFunction(
+ "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ MemcpyFn = M.getOrInsertFunction(
+ "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IntptrTy);
+ MemsetFn = M.getOrInsertFunction(
+ "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+ IntptrTy);
+
+ // Create globals.
+ RetvalTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ RetvalOriginTLS = new GlobalVariable(
+ M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
+ ParamTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ ParamOriginTLS = new GlobalVariable(
+ M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
+ nullptr, GlobalVariable::InitialExecTLSModel);
+
+ VAArgTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
+ GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ VAArgOverflowSizeTLS = new GlobalVariable(
+ M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_va_arg_overflow_size_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ OriginTLS = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+ "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
+
+ // We insert an empty inline asm after __msan_report* to avoid callback merge.
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+ StringRef(""), StringRef(""),
+ /*hasSideEffects=*/true);
+}
+
+/// \brief Module-level initialization.
+///
+/// Inserts a call to __msan_init into the module's constructor list.
+bool MemorySanitizer::doInitialization(Module &M) {
+ auto &DL = M.getDataLayout();
+
+ Triple TargetTriple(M.getTargetTriple());
+ switch (TargetTriple.getOS()) {
+ case Triple::FreeBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::Linux:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = Linux_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = Linux_X86_MemoryMapParams.bits32;
+ break;
+ case Triple::mips64:
+ case Triple::mips64el:
+ MapParams = Linux_MIPS_MemoryMapParams.bits64;
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ MapParams = Linux_PowerPC_MemoryMapParams.bits64;
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ MapParams = Linux_ARM_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ default:
+ report_fatal_error("unsupported operating system");
+ }
+
+ C = &(M.getContext());
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(DL);
+ OriginTy = IRB.getInt32Ty();
+
+ ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+ OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+ std::tie(MsanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kMsanModuleCtorName, kMsanInitName,
+ /*InitArgTypes=*/{},
+ /*InitArgs=*/{});
+ if (ClWithComdat) {
+ Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
+ MsanCtorFunction->setComdat(MsanCtorComdat);
+ appendToGlobalCtors(M, MsanCtorFunction, 0, MsanCtorFunction);
+ } else {
+ appendToGlobalCtors(M, MsanCtorFunction, 0);
+ }
+
+ if (TrackOrigins)
+ new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(TrackOrigins), "__msan_track_origins");
+
+ if (Recover)
+ new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(Recover), "__msan_keep_going");
+
+ return true;
+}
+
+namespace {
+
+/// \brief A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+ /// \brief Visit a CallSite.
+ virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
+
+ /// \brief Visit a va_start call.
+ virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+ /// \brief Visit a va_copy call.
+ virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+ /// \brief Finalize function instrumentation.
+ ///
+ /// This method is called after visiting all interesting (see above)
+ /// instructions in a function.
+ virtual void finalizeInstrumentation() = 0;
+
+ virtual ~VarArgHelper() {}
+};
+
+struct MemorySanitizerVisitor;
+
+VarArgHelper*
+CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor);
+
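+/// \brief Map a type size in bits to an access-size index.
+///
+/// The index selects among the per-size runtime helpers (MaybeWarningFn /
+/// MaybeStoreOriginFn): 1..8 bits -> 0 (1-byte access), 16 bits -> 1,
+/// 32 bits -> 2, 64 bits -> 3.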
+unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
+ if (TypeSize <= 8) return 0;
+ return Log2_32_Ceil((TypeSize + 7) / 8);
+}
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+ Function &F;
+ MemorySanitizer &MS;
+ SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+ ValueMap<Value*, Value*> ShadowMap, OriginMap;
+ std::unique_ptr<VarArgHelper> VAHelper;
+ const TargetLibraryInfo *TLI;
+
+ // The following flags disable parts of MSan instrumentation based on
+ // blacklist contents and command-line options.
+ bool InsertChecks;
+ bool PropagateShadow;
+ bool PoisonStack;
+ bool PoisonUndef;
+ bool CheckReturnValue;
+
+ struct ShadowOriginAndInsertPoint {
+ Value *Shadow;
+ Value *Origin;
+ Instruction *OrigIns;
+ ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
+ : Shadow(S), Origin(O), OrigIns(I) { }
+ };
+ SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+ SmallVector<StoreInst *, 16> StoreList;
+
+ MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
+ : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
+ InsertChecks = SanitizeFunction;
+ PropagateShadow = SanitizeFunction;
+ PoisonStack = SanitizeFunction && ClPoisonStack;
+ PoisonUndef = SanitizeFunction && ClPoisonUndef;
+ // FIXME: Consider using SpecialCaseList to specify a list of functions that
+ // must always return fully initialized values. For now, we hardcode "main".
+ CheckReturnValue = SanitizeFunction && (F.getName() == "main");
+ TLI = &MS.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ DEBUG(if (!InsertChecks)
+ dbgs() << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
+ }
+
+ Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
+ if (MS.TrackOrigins <= 1) return V;
+ return IRB.CreateCall(MS.MsanChainOriginFn, V);
+ }
+
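+ /// \brief Widen a 32-bit origin id to pointer width.
+ ///
+ /// When intptr is wider than an origin (64-bit targets), the origin is
+ /// replicated into both 32-bit halves so that paintOrigin() can store
+ /// origins in intptr-sized chunks.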
+ Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ if (IntptrSize == kOriginSize) return Origin;
+ assert(IntptrSize == kOriginSize * 2);
+ Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
+ return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
+ }
+
+ /// \brief Fill memory range with the given origin value.
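+ ///
+ /// When intptr is wider than an origin and the range is sufficiently
+ /// aligned, origins are first written in intptr-sized chunks (with the
+ /// origin replicated into both halves, see originToIntptr); any remainder
+ /// is written one origin (kOriginSize bytes) at a time.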
+ void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
+ unsigned Size, unsigned Alignment) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy);
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ assert(IntptrAlignment >= kMinOriginAlignment);
+ assert(IntptrSize >= kOriginSize);
+
+ unsigned Ofs = 0;
+ unsigned CurrentAlignment = Alignment;
+ if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
+ Value *IntptrOrigin = originToIntptr(IRB, Origin);
+ Value *IntptrOriginPtr =
+ IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
+ for (unsigned i = 0; i < Size / IntptrSize; ++i) {
+ Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
+ : IntptrOriginPtr;
+ IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
+ Ofs += IntptrSize / kOriginSize;
+ CurrentAlignment = IntptrAlignment;
+ }
+ }
+
+ for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
+ Value *GEP =
+ i ? IRB.CreateConstGEP1_32(nullptr, OriginPtr, i) : OriginPtr;
+ IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
+ CurrentAlignment = kMinOriginAlignment;
+ }
+ }
+
+ void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
+ unsigned Alignment, bool AsCall) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ if (Shadow->getType()->isAggregateType()) {
+ paintOrigin(IRB, updateOrigin(Origin, IRB),
+ getOriginPtr(Addr, IRB, Alignment), StoreSize,
+ OriginAlignment);
+ } else {
+ Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+ Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
+ if (ConstantShadow) {
+ if (ClCheckConstantShadow && !ConstantShadow->isZeroValue())
+ paintOrigin(IRB, updateOrigin(Origin, IRB),
+ getOriginPtr(Addr, IRB, Alignment), StoreSize,
+ OriginAlignment);
+ return;
+ }
+
+ unsigned TypeSizeInBits =
+ DL.getTypeSizeInBits(ConvertedShadow->getType());
+ unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
+ if (AsCall && SizeIndex < kNumberOfAccessSizes) {
+ Value *Fn = MS.MaybeStoreOriginFn[SizeIndex];
+ Value *ConvertedShadow2 = IRB.CreateZExt(
+ ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
+ IRB.CreateCall(Fn, {ConvertedShadow2,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ Origin});
+ } else {
+ Value *Cmp = IRB.CreateICmpNE(
+ ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp");
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights);
+ IRBuilder<> IRBNew(CheckTerm);
+ paintOrigin(IRBNew, updateOrigin(Origin, IRBNew),
+ getOriginPtr(Addr, IRBNew, Alignment), StoreSize,
+ OriginAlignment);
+ }
+ }
+ }
+
+ void materializeStores(bool InstrumentWithCalls) {
+ for (StoreInst *SI : StoreList) {
+ IRBuilder<> IRB(SI);
+ Value *Val = SI->getValueOperand();
+ Value *Addr = SI->getPointerOperand();
+ Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
+ Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+ StoreInst *NewSI =
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, SI->getAlignment());
+ DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, SI);
+
+ if (SI->isAtomic())
+ SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
+
+ if (MS.TrackOrigins && !SI->isAtomic())
+ storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI->getAlignment(),
+ InstrumentWithCalls);
+ }
+ }
+
+ void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
+ bool AsCall) {
+ IRBuilder<> IRB(OrigIns);
+ DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+ DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+
+ Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow);
+ if (ConstantShadow) {
+ if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
+ if (MS.TrackOrigins) {
+ IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
+ MS.OriginTLS);
+ }
+ IRB.CreateCall(MS.WarningFn, {});
+ IRB.CreateCall(MS.EmptyAsm, {});
+ // FIXME: Insert UnreachableInst if !MS.Recover?
+ // This may invalidate some of the following checks and needs to be done
+ // at the very end.
+ }
+ return;
+ }
+
+ const DataLayout &DL = OrigIns->getModule()->getDataLayout();
+
+ unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
+ unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
+ if (AsCall && SizeIndex < kNumberOfAccessSizes) {
+ Value *Fn = MS.MaybeWarningFn[SizeIndex];
+ Value *ConvertedShadow2 =
+ IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
+ IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin
+ ? Origin
+ : (Value *)IRB.getInt32(0)});
+ } else {
+ Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+ getCleanShadow(ConvertedShadow), "_mscmp");
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, OrigIns,
+ /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
+
+ IRB.SetInsertPoint(CheckTerm);
+ if (MS.TrackOrigins) {
+ IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0),
+ MS.OriginTLS);
+ }
+ IRB.CreateCall(MS.WarningFn, {});
+ IRB.CreateCall(MS.EmptyAsm, {});
+ DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ }
+ }
+
+ void materializeChecks(bool InstrumentWithCalls) {
+ for (const auto &ShadowData : InstrumentationList) {
+ Instruction *OrigIns = ShadowData.OrigIns;
+ Value *Shadow = ShadowData.Shadow;
+ Value *Origin = ShadowData.Origin;
+ materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
+ }
+ DEBUG(dbgs() << "DONE:\n" << F);
+ }
+
+ /// \brief Add MemorySanitizer instrumentation to a function.
+ bool runOnFunction() {
+ MS.initializeCallbacks(*F.getParent());
+
+ // In the presence of unreachable blocks, we may see Phi nodes with
+ // incoming values from such blocks. Since InstVisitor skips unreachable
+ // blocks, such nodes will not have any shadow value associated with them.
+ // It's easier to remove unreachable blocks than deal with missing shadow.
+ removeUnreachableBlocks(F);
+
+ // Iterate all BBs in depth-first order and create shadow instructions
+ // for all instructions (where applicable).
+ // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+ visit(*BB);
+
+ // Finalize PHI nodes.
+ for (PHINode *PN : ShadowPHINodes) {
+ PHINode *PNS = cast<PHINode>(getShadow(PN));
+ PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
+ size_t NumValues = PN->getNumIncomingValues();
+ for (size_t v = 0; v < NumValues; v++) {
+ PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+ if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+ }
+ }
+
+ VAHelper->finalizeInstrumentation();
+
+ bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
+ InstrumentationList.size() + StoreList.size() >
+ (unsigned)ClInstrumentationWithCallThreshold;
+
+ // Delayed instrumentation of StoreInst.
+ // This may add new checks to be inserted later.
+ materializeStores(InstrumentWithCalls);
+
+ // Insert shadow value checks.
+ materializeChecks(InstrumentWithCalls);
+
+ return true;
+ }
+
+ /// \brief Compute the shadow type that corresponds to a given Value.
+ Type *getShadowTy(Value *V) {
+ return getShadowTy(V->getType());
+ }
+
+ /// \brief Compute the shadow type that corresponds to a given Type.
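+ ///
+ /// Integer types map to themselves, vectors map to vectors of integers of
+ /// the same element width, arrays and structs are mapped element-wise, and
+ /// any other sized type maps to an integer of the same bit width.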
+ Type *getShadowTy(Type *OrigTy) {
+ if (!OrigTy->isSized()) {
+ return nullptr;
+ }
+ // For integer type, shadow is the same as the original type.
+ // This may return weird-sized types like i1.
+ if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+ return IT;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+ uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
+ return VectorType::get(IntegerType::get(*MS.C, EltSize),
+ VT->getNumElements());
+ }
+ if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+ return ArrayType::get(getShadowTy(AT->getElementType()),
+ AT->getNumElements());
+ }
+ if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+ SmallVector<Type*, 4> Elements;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Elements.push_back(getShadowTy(ST->getElementType(i)));
+ StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+ DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ return Res;
+ }
+ uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
+ return IntegerType::get(*MS.C, TypeSize);
+ }
+
+ /// \brief Flatten a vector type.
+ Type *getShadowTyNoVec(Type *ty) {
+ if (VectorType *vt = dyn_cast<VectorType>(ty))
+ return IntegerType::get(*MS.C, vt->getBitWidth());
+ return ty;
+ }
+
+ /// \brief Convert a shadow value to its flattened variant.
+ Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+ Type *Ty = V->getType();
+ Type *NoVecTy = getShadowTyNoVec(Ty);
+ if (Ty == NoVecTy) return V;
+ return IRB.CreateBitCast(V, NoVecTy);
+ }
+
+ /// \brief Compute the integer shadow offset that corresponds to a given
+ /// application address.
+ ///
+ /// Offset = (Addr & ~AndMask) ^ XorMask
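+ ///
+ /// Illustrative example (hypothetical masks): with AndMask = 0x700000000000
+ /// and XorMask = 0, an application address 0x7fff12345678 maps to the offset
+ /// (0x7fff12345678 & ~0x700000000000) = 0x0fff12345678.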
+ Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+ Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
+
+ uint64_t AndMask = MS.MapParams->AndMask;
+ if (AndMask)
+ OffsetLong =
+ IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+ uint64_t XorMask = MS.MapParams->XorMask;
+ if (XorMask)
+ OffsetLong =
+ IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
+ return OffsetLong;
+ }
+
+ /// \brief Compute the shadow address that corresponds to a given application
+ /// address.
+ ///
+ /// Shadow = ShadowBase + Offset
+ Value *getShadowPtr(Value *Addr, Type *ShadowTy,
+ IRBuilder<> &IRB) {
+ Value *ShadowLong = getShadowPtrOffset(Addr, IRB);
+ uint64_t ShadowBase = MS.MapParams->ShadowBase;
+ if (ShadowBase != 0)
+ ShadowLong =
+ IRB.CreateAdd(ShadowLong,
+ ConstantInt::get(MS.IntptrTy, ShadowBase));
+ return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+ }
+
+ /// \brief Compute the origin address that corresponds to a given application
+ /// address.
+ ///
+ /// OriginAddr = (OriginBase + Offset) & ~3ULL
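+ ///
+ /// The trailing mask aligns the address down to kMinOriginAlignment, the
+ /// granularity at which origin values are stored; it is only applied when
+ /// the access alignment does not already guarantee it.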
+ Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB, unsigned Alignment) {
+ Value *OriginLong = getShadowPtrOffset(Addr, IRB);
+ uint64_t OriginBase = MS.MapParams->OriginBase;
+ if (OriginBase != 0)
+ OriginLong =
+ IRB.CreateAdd(OriginLong,
+ ConstantInt::get(MS.IntptrTy, OriginBase));
+ if (Alignment < kMinOriginAlignment) {
+ uint64_t Mask = kMinOriginAlignment - 1;
+ OriginLong = IRB.CreateAnd(OriginLong,
+ ConstantInt::get(MS.IntptrTy, ~Mask));
+ }
+ return IRB.CreateIntToPtr(OriginLong,
+ PointerType::get(IRB.getInt32Ty(), 0));
+ }
+
+ /// \brief Compute the shadow address for a given function argument.
+ ///
+ /// Shadow = ParamTLS+ArgOffset.
+ Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msarg");
+ }
+
+ /// \brief Compute the origin address for a given function argument.
+ Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ if (!MS.TrackOrigins) return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_o");
+ }
+
+ /// \brief Compute the shadow address for a retval.
+ Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+ Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy);
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msret");
+ }
+
+ /// \brief Compute the origin address for a retval.
+ Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+ // We keep a single origin for the entire retval. Might be too optimistic.
+ return MS.RetvalOriginTLS;
+ }
+
+ /// \brief Set SV to be the shadow value for V.
+ void setShadow(Value *V, Value *SV) {
+ assert(!ShadowMap.count(V) && "Values may only have one shadow");
+ ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
+ }
+
+ /// \brief Set Origin to be the origin value for V.
+ void setOrigin(Value *V, Value *Origin) {
+ if (!MS.TrackOrigins) return;
+ assert(!OriginMap.count(V) && "Values may only have one origin");
+ DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ OriginMap[V] = Origin;
+ }
+
+ Constant *getCleanShadow(Type *OrigTy) {
+ Type *ShadowTy = getShadowTy(OrigTy);
+ if (!ShadowTy)
+ return nullptr;
+ return Constant::getNullValue(ShadowTy);
+ }
+
+ /// \brief Create a clean shadow value for a given value.
+ ///
+ /// Clean shadow (all zeroes) means all bits of the value are defined
+ /// (initialized).
+ Constant *getCleanShadow(Value *V) {
+ return getCleanShadow(V->getType());
+ }
+
+ /// \brief Create a dirty shadow of a given shadow type.
+ Constant *getPoisonedShadow(Type *ShadowTy) {
+ assert(ShadowTy);
+ if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+ return Constant::getAllOnesValue(ShadowTy);
+ if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+ getPoisonedShadow(AT->getElementType()));
+ return ConstantArray::get(AT, Vals);
+ }
+ if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+ return ConstantStruct::get(ST, Vals);
+ }
+ llvm_unreachable("Unexpected shadow type");
+ }
+
+ /// \brief Create a dirty shadow for a given value.
+ Constant *getPoisonedShadow(Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (!ShadowTy)
+ return nullptr;
+ return getPoisonedShadow(ShadowTy);
+ }
+
+ /// \brief Create a clean (zero) origin.
+ Value *getCleanOrigin() {
+ return Constant::getNullValue(MS.OriginTy);
+ }
+
+ /// \brief Get the shadow value for a given Value.
+ ///
+ /// This function either returns the value set earlier with setShadow,
+ /// or extracts it from ParamTLS (for function arguments).
+ Value *getShadow(Value *V) {
+ if (!PropagateShadow) return getCleanShadow(V);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // For instructions the shadow is already stored in the map.
+ Value *Shadow = ShadowMap[V];
+ if (!Shadow) {
+ DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ (void)I;
+ assert(Shadow && "No shadow for a value");
+ }
+ return Shadow;
+ }
+ if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+ Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ (void)U;
+ return AllOnes;
+ }
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ // For arguments we compute the shadow on demand and store it in the map.
+ Value **ShadowPtr = &ShadowMap[V];
+ if (*ShadowPtr)
+ return *ShadowPtr;
+ Function *F = A->getParent();
+ IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+ unsigned ArgOffset = 0;
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ for (auto &FArg : F->args()) {
+ if (!FArg.getType()->isSized()) {
+ DEBUG(dbgs() << "Arg is not sized\n");
+ continue;
+ }
+ unsigned Size =
+ FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType())
+ : DL.getTypeAllocSize(FArg.getType());
+ if (A == &FArg) {
+ bool Overflow = ArgOffset + Size > kParamTLSSize;
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (FArg.hasByValAttr()) {
+ // ByVal pointer itself has clean shadow. We copy the actual
+ // argument shadow to the underlying memory.
+ // Figure out maximal valid memcpy alignment.
+ unsigned ArgAlign = FArg.getParamAlignment();
+ if (ArgAlign == 0) {
+ Type *EltType = A->getType()->getPointerElementType();
+ ArgAlign = DL.getABITypeAlignment(EltType);
+ }
+ if (Overflow) {
+ // ParamTLS overflow.
+ EntryIRB.CreateMemSet(
+ getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
+ Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign);
+ } else {
+ unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
+ Value *Cpy = EntryIRB.CreateMemCpy(
+ getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size,
+ CopyAlign);
+ DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ (void)Cpy;
+ }
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ if (Overflow) {
+ // ParamTLS overflow.
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ *ShadowPtr =
+ EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
+ }
+ }
+ DEBUG(dbgs() << " ARG: " << FArg << " ==> " <<
+ **ShadowPtr << "\n");
+ if (MS.TrackOrigins && !Overflow) {
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
+ } else {
+ setOrigin(A, getCleanOrigin());
+ }
+ }
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
+ }
+ assert(*ShadowPtr && "Could not find shadow for an argument");
+ return *ShadowPtr;
+ }
+ // For everything else the shadow is zero.
+ return getCleanShadow(V);
+ }
+
+ /// \brief Get the shadow for i-th argument of the instruction I.
+ Value *getShadow(Instruction *I, int i) {
+ return getShadow(I->getOperand(i));
+ }
+
+ /// \brief Get the origin for a value.
+ Value *getOrigin(Value *V) {
+ if (!MS.TrackOrigins) return nullptr;
+ if (!PropagateShadow) return getCleanOrigin();
+ if (isa<Constant>(V)) return getCleanOrigin();
+ assert((isa<Instruction>(V) || isa<Argument>(V)) &&
+ "Unexpected value type in getOrigin()");
+ Value *Origin = OriginMap[V];
+ assert(Origin && "Missing origin");
+ return Origin;
+ }
+
+ /// \brief Get the origin for i-th argument of the instruction I.
+ Value *getOrigin(Instruction *I, int i) {
+ return getOrigin(I->getOperand(i));
+ }
+
+ /// \brief Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will later be instrumented with a check that prints a
+ /// UMR warning at runtime if the shadow value is not 0.
+ void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+ assert(Shadow);
+ if (!InsertChecks) return;
+#ifndef NDEBUG
+ Type *ShadowTy = Shadow->getType();
+ assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
+ "Can only insert checks for integer and vector shadow types");
+#endif
+ InstrumentationList.push_back(
+ ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ }
+
+ /// \brief Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will later be instrumented with a check that prints a
+ /// UMR warning at runtime if the value is not fully defined.
+ void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+ assert(Val);
+ Value *Shadow, *Origin;
+ if (ClCheckConstantShadow) {
+ Shadow = getShadow(Val);
+ if (!Shadow) return;
+ Origin = getOrigin(Val);
+ } else {
+ Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+ if (!Shadow) return;
+ Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+ }
+ insertShadowCheck(Shadow, Origin, OrigIns);
+ }
+
+ AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
+ return AtomicOrdering::Release;
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
+
+ AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ return AtomicOrdering::Acquire;
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
+
+ // ------------------- Visitors.
+
+ /// \brief Instrument LoadInst
+ ///
+ /// Loads the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the load address is fully defined.
+ void visitLoadInst(LoadInst &I) {
+ assert(I.getType()->isSized() && "Load type must have size");
+ IRBuilder<> IRB(I.getNextNode());
+ Type *ShadowTy = getShadowTy(&I);
+ Value *Addr = I.getPointerOperand();
+ if (PropagateShadow && !I.getMetadata("nosanitize")) {
+ Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
+
+ if (I.isAtomic())
+ I.setOrdering(addAcquireOrdering(I.getOrdering()));
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ unsigned Alignment = I.getAlignment();
+ unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB, Alignment),
+ OriginAlignment));
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ }
+
+ /// \brief Instrument StoreInst
+ ///
+ /// Stores the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the store address is fully defined.
+ void visitStoreInst(StoreInst &I) {
+ StoreList.push_back(&I);
+ }
+
+ void handleCASOrRMW(Instruction &I) {
+ assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getOperand(0);
+ Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB);
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // Only test the conditional argument of cmpxchg instruction.
+ // The other argument can potentially be uninitialized, but we can not
+ // detect this situation reliably without possible false positives.
+ if (isa<AtomicCmpXchgInst>(I))
+ insertShadowCheck(I.getOperand(1), &I);
+
+ IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &I) {
+ handleCASOrRMW(I);
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ }
+
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ handleCASOrRMW(I);
+ I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
+ }
+
+ // Vector manipulation.
+ void visitExtractElementInst(ExtractElementInst &I) {
+ insertShadowCheck(I.getOperand(1), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+ "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitInsertElementInst(InsertElementInst &I) {
+ insertShadowCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShuffleVectorInst(ShuffleVectorInst &I) {
+ insertShadowCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ // Casts.
+ void visitSExtInst(SExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitZExtInst(ZExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitTruncInst(TruncInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitBitCastInst(BitCastInst &I) {
+ // Special case: if this is the bitcast (there is exactly 1 allowed) between
+ // a musttail call and a ret, don't instrument. New instructions are not
+ // allowed after a musttail call.
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
+ if (CI->isMustTailCall())
+ return;
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_ptrtoint"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitIntToPtrInst(IntToPtrInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_inttoptr"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+ void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+ /// \brief Propagate shadow for bitwise AND.
+ ///
+ /// This code is exact, i.e. if, for example, a bit in the left argument
+ /// is defined and 0, then neither the value nor the definedness of the
+ /// corresponding bit in B affects the resulting shadow.
+ void visitAnd(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "And" of 0 and a poisoned value results in unpoisoned value.
+ // 1&1 => 1; 0&1 => 0; p&1 => p;
+ // 1&0 => 0; 0&0 => 0; p&0 => 0;
+ // 1&p => p; 0&p => 0; p&p => p;
+ // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
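+ // Worked example (single bit): with V1 = 0 defined (S1 = 0) and V2
+ // poisoned (S2 = 1), S = (0&1)|(0&1)|(0&V2) = 0, i.e. 0 & p is fully
+ // defined. With V1 = 1 defined instead, S = (0&1)|(1&1)|(0&V2) = 1,
+ // i.e. 1 & p stays poisoned.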
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+ setOriginForNaryOp(I);
+ }
+
+ void visitOr(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "Or" of 1 and a poisoned value results in unpoisoned value.
+ // 1|1 => 1; 0|1 => 1; p|1 => 1;
+ // 1|0 => 1; 0|0 => 0; p|0 => p;
+ // 1|p => 1; 0|p => p; p|p => p;
+ // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = IRB.CreateNot(I.getOperand(0));
+ Value *V2 = IRB.CreateNot(I.getOperand(1));
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Default propagation of shadow and/or origin.
+ ///
+ /// This class implements the general case of shadow propagation, used in all
+ /// cases where we don't know and/or don't care about what the operation
+ /// actually does. It converts all input shadow values to a common type
+ /// (extending or truncating as necessary), and bitwise OR's them.
+ ///
+ /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+ /// fully initialized), and less prone to false positives.
+ ///
+ /// This class also implements the general case of origin propagation. For a
+ /// Nary operation, result origin is set to the origin of an argument that is
+ /// not entirely initialized. If there is more than one such argument, the
+ /// rightmost of them is picked. It does not matter which one is picked if all
+ /// arguments are initialized.
+ template <bool CombineShadow>
+ class Combiner {
+ Value *Shadow;
+ Value *Origin;
+ IRBuilder<> &IRB;
+ MemorySanitizerVisitor *MSV;
+
+ public:
+ Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) :
+ Shadow(nullptr), Origin(nullptr), IRB(IRB), MSV(MSV) {}
+
+ /// \brief Add a pair of shadow and origin values to the mix.
+ Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+ if (CombineShadow) {
+ assert(OpShadow);
+ if (!Shadow)
+ Shadow = OpShadow;
+ else {
+ OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+ Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+ }
+ }
+
+ if (MSV->MS.TrackOrigins) {
+ assert(OpOrigin);
+ if (!Origin) {
+ Origin = OpOrigin;
+ } else {
+ Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+ // No point in adding something that might result in 0 origin value.
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) {
+ Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
+ Value *Cond =
+ IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
+ Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ }
+ }
+ }
+ return *this;
+ }
+
+ /// \brief Add an application value to the mix.
+ Combiner &Add(Value *V) {
+ Value *OpShadow = MSV->getShadow(V);
+ Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
+ return Add(OpShadow, OpOrigin);
+ }
+
+ /// \brief Set the current combined values as the given instruction's shadow
+ /// and origin.
+ void Done(Instruction *I) {
+ if (CombineShadow) {
+ assert(Shadow);
+ Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+ MSV->setShadow(I, Shadow);
+ }
+ if (MSV->MS.TrackOrigins) {
+ assert(Origin);
+ MSV->setOrigin(I, Origin);
+ }
+ }
+ };
+
+ typedef Combiner<true> ShadowAndOriginCombiner;
+ typedef Combiner<false> OriginCombiner;
+
+ /// \brief Propagate origin for arbitrary operation.
+ void setOriginForNaryOp(Instruction &I) {
+ if (!MS.TrackOrigins) return;
+ IRBuilder<> IRB(&I);
+ OriginCombiner OC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ OC.Add(OI->get());
+ OC.Done(&I);
+ }
+
+ size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+ assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+ "Vector of pointers is not a valid shadow type");
+ return Ty->isVectorTy() ?
+ Ty->getVectorNumElements() * Ty->getScalarSizeInBits() :
+ Ty->getPrimitiveSizeInBits();
+ }
+
+ /// \brief Cast between two shadow types, extending or truncating as
+ /// necessary.
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+ bool Signed = false) {
+ Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
+ if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+ dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+ Value *V2 =
+ IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
+ return IRB.CreateBitCast(V2, dstTy);
+ // TODO: handle struct types.
+ }
+
+ /// \brief Cast an application value to the type of its own shadow.
+ Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (V->getType() == ShadowTy)
+ return V;
+ if (V->getType()->isPtrOrPtrVectorTy())
+ return IRB.CreatePtrToInt(V, ShadowTy);
+ else
+ return IRB.CreateBitCast(V, ShadowTy);
+ }
+
+ /// \brief Propagate shadow for arbitrary operation.
+ void handleShadowOr(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ SC.Add(OI->get());
+ SC.Done(&I);
+ }
+
+ // \brief Handle multiplication by constant.
+ //
+ // Handle a special case of multiplication by constant that may have one or
+ // more zeros in the lower bits. This makes the corresponding number of lower bits
+ // of the result zero as well. We model it by shifting the other operand
+ // shadow left by the required number of bits. Effectively, we transform
+ // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
+ // We use multiplication by 2**N instead of shift to cover the case of
+ // multiplication by 0, which may occur in some elements of a vector operand.
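+ //
+ // Worked example: for X * 12 the constant is 3 * 2**2, so the two lowest
+ // bits of the result are always zero; the result shadow is modeled as
+ // Sx * 4, i.e. Sx << 2.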
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
+ Value *OtherArg) {
+ Constant *ShadowMul;
+ Type *Ty = ConstArg->getType();
+ if (Ty->isVectorTy()) {
+ unsigned NumElements = Ty->getVectorNumElements();
+ Type *EltTy = Ty->getSequentialElementType();
+ SmallVector<Constant *, 16> Elements;
+ for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+ if (ConstantInt *Elt =
+ dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ Elements.push_back(ConstantInt::get(EltTy, V2));
+ } else {
+ Elements.push_back(ConstantInt::get(EltTy, 1));
+ }
+ }
+ ShadowMul = ConstantVector::get(Elements);
+ } else {
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ ShadowMul = ConstantInt::get(Ty, V2);
+ } else {
+ ShadowMul = ConstantInt::get(Ty, 1);
+ }
+ }
+
+ IRBuilder<> IRB(&I);
+ setShadow(&I,
+ IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
+ setOrigin(&I, getOrigin(OtherArg));
+ }
+
+ void visitMul(BinaryOperator &I) {
+ Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+ Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+ if (constOp0 && !constOp1)
+ handleMulByConstant(I, constOp0, I.getOperand(1));
+ else if (constOp1 && !constOp0)
+ handleMulByConstant(I, constOp1, I.getOperand(0));
+ else
+ handleShadowOr(I);
+ }
+
+ void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+ void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+
+ void handleDiv(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ // Strict on the second argument.
+ insertShadowCheck(I.getOperand(1), &I);
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitUDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitFDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitURem(BinaryOperator &I) { handleDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleDiv(I); }
+ void visitFRem(BinaryOperator &I) { handleDiv(I); }
+
+ /// \brief Instrument == and != comparisons.
+ ///
+ /// Sometimes the comparison result is known even if some of the bits of the
+ /// arguments are not.
+ void handleEqualityComparison(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // A == B <==> (C = A^B) == 0
+ // A != B <==> (C = A^B) != 0
+ // Sc = Sa | Sb
+ Value *C = IRB.CreateXor(A, B);
+ Value *Sc = IRB.CreateOr(Sa, Sb);
+ // We are now dealing with the i = (C == 0) comparison (C != 0 works the
+ // same way). The result is defined if one of the following is true:
+ // * there is a defined 1 bit in C
+ // * C is fully defined
+ // Si = !(C & ~Sc) && Sc
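+ // Worked example: A = 0b10 fully defined, B = 0b0? (low bit poisoned,
+ // Sb = 0b01). Then C = 0b1?, Sc = 0b01 and C & ~Sc = 0b10 != 0, so
+ // A == B is known to be false despite the poisoned bit, and Si = 0.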
+ Value *Zero = Constant::getNullValue(Sc->getType());
+ Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+ Value *Si =
+ IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+ IRB.CreateICmpEQ(
+ IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+ Si->setName("_msprop_icmp");
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Build the lowest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Set the sign bit if it is undefined (most negative value), and clear
+ // all other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+ } else {
+ // Minimize undefined bits.
+ return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+ }
+ }
+
+ /// \brief Build the highest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Clear the sign bit if it is undefined (most positive value), and set
+ // all other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+ } else {
+ // Maximize undefined bits.
+ return IRB.CreateOr(A, Sa);
+ }
+ }
+
+ /// \brief Instrument relational comparisons.
+ ///
+ /// This function does exact shadow propagation for all relational
+ /// comparisons of integers, pointers and vectors of those.
+ /// FIXME: output seems suboptimal when one of the operands is a constant
+ void handleRelationalComparisonExact(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // Let [a0, a1] be the interval of possible values of A, taking into account
+ // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+ // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
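+ // Worked example (unsigned): A = 0b10? (Sa = 0b001, so A is 4 or 5) and
+ // B = 7 fully defined. Both interval endpoints of A satisfy A < B, so
+ // (a0 < b1) == (a1 < b0) and the comparison result is fully defined.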
+ bool IsSigned = I.isSigned();
+ Value *S1 = IRB.CreateICmp(I.getPredicate(),
+ getLowestPossibleValue(IRB, A, Sa, IsSigned),
+ getHighestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *S2 = IRB.CreateICmp(I.getPredicate(),
+ getHighestPossibleValue(IRB, A, Sa, IsSigned),
+ getLowestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *Si = IRB.CreateXor(S1, S2);
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Instrument signed relational comparisons.
+ ///
+ /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
+ /// bit of the shadow. Everything else is delegated to handleShadowOr().
+ void handleSignedRelationalComparison(ICmpInst &I) {
+ Constant *constOp;
+ Value *op = nullptr;
+ CmpInst::Predicate pre;
+ if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
+ op = I.getOperand(0);
+ pre = I.getPredicate();
+ } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
+ op = I.getOperand(1);
+ pre = I.getSwappedPredicate();
+ } else {
+ handleShadowOr(I);
+ return;
+ }
+
+ if ((constOp->isNullValue() &&
+ (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
+ (constOp->isAllOnesValue() &&
+ (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
+ IRBuilder<> IRB(&I);
+ Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
+ "_msprop_icmp_s");
+ setShadow(&I, Shadow);
+ setOrigin(&I, getOrigin(op));
+ } else {
+ handleShadowOr(I);
+ }
+ }
+
+ void visitICmpInst(ICmpInst &I) {
+ if (!ClHandleICmp) {
+ handleShadowOr(I);
+ return;
+ }
+ if (I.isEquality()) {
+ handleEqualityComparison(I);
+ return;
+ }
+
+ assert(I.isRelational());
+ if (ClHandleICmpExact) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+ if (I.isSigned()) {
+ handleSignedRelationalComparison(I);
+ return;
+ }
+
+ assert(I.isUnsigned());
+ if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+
+ handleShadowOr(I);
+ }
+
+ void visitFCmpInst(FCmpInst &I) {
+ handleShadowOr(I);
+ }
+
+ void handleShift(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+ S2->getType());
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShl(BinaryOperator &I) { handleShift(I); }
+ void visitAShr(BinaryOperator &I) { handleShift(I); }
+ void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+ /// \brief Instrument llvm.memmove
+ ///
+ /// At this point we don't know if llvm.memmove will be inlined or not.
+ /// If we don't instrument it and it gets inlined,
+ /// our interceptor will not kick in and we will lose the memmove.
+ /// If we instrument the call here, but it does not get inlined,
+ /// we will memmove the shadow twice, which is bad in the case
+ /// of overlapping regions. So, we simply lower the intrinsic to a call.
+ ///
+ /// Similar situation exists for memcpy and memset.
+ void visitMemMoveInst(MemMoveInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemmoveFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Similar to memmove: avoid copying shadow twice.
+ // This is somewhat unfortunate as it may slow down small constant memcpys.
+ // FIXME: consider doing manual inline for small constant sizes and proper
+ // alignment.
+ void visitMemCpyInst(MemCpyInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemcpyFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Same as memcpy.
+ void visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemsetFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ void visitVAStartInst(VAStartInst &I) {
+ VAHelper->visitVAStartInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) {
+ VAHelper->visitVACopyInst(I);
+ }
+
+ /// \brief Handle vector store-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+ /// has 1 pointer argument and 1 vector argument, returns void.
+ bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Value *Shadow = getShadow(&I, 1);
+ Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+ // We don't know the pointer alignment (could be unaligned SSE store!).
+ // We have to assume the worst case.
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // FIXME: factor out common code from materializeStores
+ if (MS.TrackOrigins)
+ IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
+ return true;
+ }
+
+ /// \brief Handle vector load-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+ /// has 1 pointer argument, returns a vector.
+ bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+
+ Type *ShadowTy = getShadowTy(&I);
+ if (PropagateShadow) {
+ Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+ // We don't know the pointer alignment (could be unaligned SSE load!).
+ // We have to assume the worst case.
+ setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, 1, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow)
+ setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB, 1)));
+ else
+ setOrigin(&I, getCleanOrigin());
+ }
+ return true;
+ }
+
+ /// \brief Handle (SIMD arithmetic)-like intrinsics.
+ ///
+ /// Instrument intrinsics with any number of arguments of the same type,
+ /// equal to the return type. The type should be simple (no aggregates or
+ /// pointers; vectors are fine).
+ /// Caller guarantees that this intrinsic does not access memory.
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+ Type *RetTy = I.getType();
+ if (!(RetTy->isIntOrIntVectorTy() ||
+ RetTy->isFPOrFPVectorTy() ||
+ RetTy->isX86_MMXTy()))
+ return false;
+
+ unsigned NumArgOperands = I.getNumArgOperands();
+
+ for (unsigned i = 0; i < NumArgOperands; ++i) {
+ Type *Ty = I.getArgOperand(i)->getType();
+ if (Ty != RetTy)
+ return false;
+ }
+
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (unsigned i = 0; i < NumArgOperands; ++i)
+ SC.Add(I.getArgOperand(i));
+ SC.Done(&I);
+
+ return true;
+ }
+
+ /// \brief Heuristically instrument unknown intrinsics.
+ ///
+ /// The main purpose of this code is to do something reasonable with all
+ /// random intrinsics we might encounter, most importantly SIMD intrinsics.
+ /// We recognize several classes of intrinsics by their argument types and
+ /// ModRefBehaviour and apply special instrumentation when we are reasonably
+ /// sure that we know what the intrinsic does.
+ ///
+ /// We special-case intrinsics where this approach fails. See llvm.bswap
+ /// handling as an example of that.
+ bool handleUnknownIntrinsic(IntrinsicInst &I) {
+ unsigned NumArgOperands = I.getNumArgOperands();
+ if (NumArgOperands == 0)
+ return false;
+
+ if (NumArgOperands == 2 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getArgOperand(1)->getType()->isVectorTy() &&
+ I.getType()->isVoidTy() &&
+ !I.onlyReadsMemory()) {
+ // This looks like a vector store.
+ return handleVectorStoreIntrinsic(I);
+ }
+
+ if (NumArgOperands == 1 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getType()->isVectorTy() &&
+ I.onlyReadsMemory()) {
+ // This looks like a vector load.
+ return handleVectorLoadIntrinsic(I);
+ }
+
+ if (I.doesNotAccessMemory())
+ if (maybeHandleSimpleNomemIntrinsic(I))
+ return true;
+
+ // FIXME: detect and handle SSE maskstore/maskload
+ return false;
+ }
+
+ void handleBswap(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Op = I.getArgOperand(0);
+ Type *OpType = Op->getType();
+ Function *BswapFunc = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
+ setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+ setOrigin(&I, getOrigin(Op));
+ }
+
+ // \brief Instrument vector convert intrinsic.
+ //
+ // This function instruments intrinsics like cvtsi2ss:
+ // %Out = int_xxx_cvtyyy(%ConvertOp)
+ // or
+ // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+ // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+ // number of \p Out elements and (if it has 2 arguments) copies the rest of the
+ // elements from \p CopyOp.
+ // In most cases conversion involves floating-point value which may trigger a
+ // hardware exception when not fully initialized. For this reason we require
+ // \p ConvertOp[0:NumUsedElements] to be fully initialized and trap otherwise.
+ // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+ // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+ // return a fully initialized value.
+ void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+ IRBuilder<> IRB(&I);
+ Value *CopyOp, *ConvertOp;
+
+ switch (I.getNumArgOperands()) {
+ case 3:
+ assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode");
+ LLVM_FALLTHROUGH;
+ case 2:
+ CopyOp = I.getArgOperand(0);
+ ConvertOp = I.getArgOperand(1);
+ break;
+ case 1:
+ ConvertOp = I.getArgOperand(0);
+ CopyOp = nullptr;
+ break;
+ default:
+ llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+ }
+
+ // The first *NumUsedElements* elements of ConvertOp are converted to the
+ // same number of output elements. The rest of the output is copied from
+ // CopyOp, or (if not available) filled with zeroes.
+ // Combine shadow for elements of ConvertOp that are used in this operation,
+ // and insert a check.
+ // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+ // int->any conversion.
+ Value *ConvertShadow = getShadow(ConvertOp);
+ Value *AggShadow = nullptr;
+ if (ConvertOp->getType()->isVectorTy()) {
+ AggShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1; i < NumUsedElements; ++i) {
+ Value *MoreShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+ }
+ } else {
+ AggShadow = ConvertShadow;
+ }
+ assert(AggShadow->getType()->isIntegerTy());
+ insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+ // Build result shadow by zero-filling parts of CopyOp shadow that come from
+ // ConvertOp.
+ if (CopyOp) {
+ assert(CopyOp->getType() == I.getType());
+ assert(CopyOp->getType()->isVectorTy());
+ Value *ResultShadow = getShadow(CopyOp);
+ Type *EltTy = ResultShadow->getType()->getVectorElementType();
+ for (int i = 0; i < NumUsedElements; ++i) {
+ ResultShadow = IRB.CreateInsertElement(
+ ResultShadow, ConstantInt::getNullValue(EltTy),
+ ConstantInt::get(IRB.getInt32Ty(), i));
+ }
+ setShadow(&I, ResultShadow);
+ setOrigin(&I, getOrigin(CopyOp));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+
+ // Given a scalar or vector, extract the lower 64 bits (or fewer), and return
+ // all zeroes if the value is zero, and all ones otherwise.
+ Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ if (S->getType()->isVectorTy())
+ S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
+ assert(S->getType()->getPrimitiveSizeInBits() <= 64);
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
+ // Given a vector, extract its first element, and return all
+ // zeroes if it is zero, and all ones otherwise.
+ Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
+ Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
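+ // Given a vector shadow, return a vector of the same type whose elements
+ // are all zeroes where the corresponding shadow element is zero and all
+ // ones otherwise (the per-element counterpart of Lower64ShadowExtend).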
+ Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
+ Type *T = S->getType();
+ assert(T->isVectorTy());
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return IRB.CreateSExt(S2, T);
+ }
+
+ // \brief Instrument vector shift intrinsic.
+ //
+ // This function instruments intrinsics like int_x86_avx2_psll_w.
+ // Intrinsic shifts %In by %ShiftSize bits.
+ // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
+ // size, and the rest is ignored. Behavior is defined even if shift size is
+ // greater than register (or field) width.
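+ //
+ // Shadow propagation, roughly:
+ //   Sout = (S1 shifted by %ShiftSize) | sext(S2' != 0)
+ // where S2' is the relevant part of the shift-size shadow (its lower 64
+ // bits, or each element for variable shifts), so a poisoned shift amount
+ // poisons the whole result.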
+ void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
+ assert(I.getNumArgOperands() == 2);
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
+ : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateCall(I.getCalledValue(),
+ {IRB.CreateBitCast(S1, V1->getType()), V2});
+ Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Get an X86_MMX-sized vector type.
+ Type *getMMXVectorTy(unsigned EltSizeInBits) {
+ const unsigned X86_MMXSizeInBits = 64;
+ return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+ X86_MMXSizeInBits / EltSizeInBits);
+ }
+
+ // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // intrinsic.
+ Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+ switch (id) {
+ case llvm::Intrinsic::x86_sse2_packsswb_128:
+ case llvm::Intrinsic::x86_sse2_packuswb_128:
+ return llvm::Intrinsic::x86_sse2_packsswb_128;
+
+ case llvm::Intrinsic::x86_sse2_packssdw_128:
+ case llvm::Intrinsic::x86_sse41_packusdw:
+ return llvm::Intrinsic::x86_sse2_packssdw_128;
+
+ case llvm::Intrinsic::x86_avx2_packsswb:
+ case llvm::Intrinsic::x86_avx2_packuswb:
+ return llvm::Intrinsic::x86_avx2_packsswb;
+
+ case llvm::Intrinsic::x86_avx2_packssdw:
+ case llvm::Intrinsic::x86_avx2_packusdw:
+ return llvm::Intrinsic::x86_avx2_packssdw;
+
+ case llvm::Intrinsic::x86_mmx_packsswb:
+ case llvm::Intrinsic::x86_mmx_packuswb:
+ return llvm::Intrinsic::x86_mmx_packsswb;
+
+ case llvm::Intrinsic::x86_mmx_packssdw:
+ return llvm::Intrinsic::x86_mmx_packssdw;
+ default:
+ llvm_unreachable("unexpected intrinsic id");
+ }
+ }
+
+ // \brief Instrument vector pack intrinsic.
+ //
+ // This function instruments intrinsics like x86_mmx_packsswb, which pack
+ // elements of 2 input vectors into half as many bits with saturation.
+ // Shadow is propagated with the signed variant of the same intrinsic applied
+ // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+ // EltSizeInBits is used only for x86mmx arguments.
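+ // For example (roughly), for x86_sse2_packsswb_128:
+ //   Sout = packsswb(sext(Sa != 0), sext(Sb != 0))
+ // so any poisoned bit in an input element makes the whole output element
+ // poisoned.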
+ void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+ assert(I.getNumArgOperands() == 2);
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ IRBuilder<> IRB(&I);
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ assert(isX86_MMX || S1->getType()->isVectorTy());
+
+ // SExt and ICmpNE below must apply to individual elements of input vectors.
+ // In case of x86mmx arguments, cast them to appropriate vector types and
+ // back.
+ Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+ if (isX86_MMX) {
+ S1 = IRB.CreateBitCast(S1, T);
+ S2 = IRB.CreateBitCast(S2, T);
+ }
+ Value *S1_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S1, llvm::Constant::getNullValue(T)), T);
+ Value *S2_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S2, llvm::Constant::getNullValue(T)), T);
+ if (isX86_MMX) {
+ Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+ S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+ S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+ }
+
+ Function *ShadowFn = Intrinsic::getDeclaration(
+ F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+ Value *S =
+ IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+ if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Instrument sum-of-absolute-differences intrinsic.
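+ //
+ // Shadow propagation, roughly: or the two argument shadows, widen each
+ // result-sized element to all ones if any of its bits are poisoned, then
+ // shift right to clear the bits that psadbw always zeroes (everything above
+ // the low 16 bits of each result element).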
+ void handleVectorSadIntrinsic(IntrinsicInst &I) {
+ const unsigned SignificantBitsPerResultElement = 16;
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+ unsigned ZeroBitsPerResultElement =
+ ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
+
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Instrument multiply-add intrinsic.
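+ //
+ // Shadow propagation, roughly: or the argument shadows, then make each
+ // double-width result element all ones if any bit of the corresponding pair
+ // of input elements is poisoned, matching pmadd's 2-to-1 reduction.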
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+ unsigned EltSizeInBits = 0) {
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Instrument compare-packed intrinsic.
+ // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
+ // all-ones shadow.
+ void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ResTy = getShadowTy(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = IRB.CreateSExt(
+ IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // \brief Instrument compare-scalar intrinsic.
+ // This handles both cmp* intrinsics which return the result in the first
+ // element of a vector, and comi* which return the result as i32.
+ void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
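+ // Instrument stmxcsr: the intrinsic stores the 32-bit MXCSR register to
+ // memory, so mark the 4 destination shadow bytes as initialized (and
+ // optionally check the address).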
+ void handleStmxcsr(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ Value *ShadowPtr = getShadowPtr(Addr, Ty, IRB);
+
+ IRB.CreateStore(getCleanShadow(Ty),
+ IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+ }
+
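+ // Instrument ldmxcsr: the intrinsic loads MXCSR from memory, which affects
+ // subsequent floating-point behavior, so report if any of the 4 loaded
+ // bytes are poisoned (and optionally check the address).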
+ void handleLdmxcsr(IntrinsicInst &I) {
+ if (!InsertChecks) return;
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ unsigned Alignment = 1;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ Value *Shadow = IRB.CreateAlignedLoad(getShadowPtr(Addr, Ty, IRB),
+ Alignment, "_ldmxcsr");
+ Value *Origin = MS.TrackOrigins
+ ? IRB.CreateLoad(getOriginPtr(Addr, IRB, Alignment))
+ : getCleanOrigin();
+ insertShadowCheck(Shadow, Origin, &I);
+ }
+
+ void visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
+ case llvm::Intrinsic::bswap:
+ handleBswap(I);
+ break;
+ case llvm::Intrinsic::x86_sse_stmxcsr:
+ handleStmxcsr(I);
+ break;
+ case llvm::Intrinsic::x86_sse_ldmxcsr:
+ handleLdmxcsr(I);
+ break;
+ case llvm::Intrinsic::x86_avx512_vcvtsd2usi64:
+ case llvm::Intrinsic::x86_avx512_vcvtsd2usi32:
+ case llvm::Intrinsic::x86_avx512_vcvtss2usi64:
+ case llvm::Intrinsic::x86_avx512_vcvtss2usi32:
+ case llvm::Intrinsic::x86_avx512_cvttss2usi64:
+ case llvm::Intrinsic::x86_avx512_cvttss2usi:
+ case llvm::Intrinsic::x86_avx512_cvttsd2usi64:
+ case llvm::Intrinsic::x86_avx512_cvttsd2usi:
+ case llvm::Intrinsic::x86_avx512_cvtusi2sd:
+ case llvm::Intrinsic::x86_avx512_cvtusi2ss:
+ case llvm::Intrinsic::x86_avx512_cvtusi642sd:
+ case llvm::Intrinsic::x86_avx512_cvtusi642ss:
+ case llvm::Intrinsic::x86_sse2_cvtsd2si64:
+ case llvm::Intrinsic::x86_sse2_cvtsd2si:
+ case llvm::Intrinsic::x86_sse2_cvtsd2ss:
+ case llvm::Intrinsic::x86_sse2_cvtsi2sd:
+ case llvm::Intrinsic::x86_sse2_cvtsi642sd:
+ case llvm::Intrinsic::x86_sse2_cvtss2sd:
+ case llvm::Intrinsic::x86_sse2_cvttsd2si64:
+ case llvm::Intrinsic::x86_sse2_cvttsd2si:
+ case llvm::Intrinsic::x86_sse_cvtsi2ss:
+ case llvm::Intrinsic::x86_sse_cvtsi642ss:
+ case llvm::Intrinsic::x86_sse_cvtss2si64:
+ case llvm::Intrinsic::x86_sse_cvtss2si:
+ case llvm::Intrinsic::x86_sse_cvttss2si64:
+ case llvm::Intrinsic::x86_sse_cvttss2si:
+ handleVectorConvertIntrinsic(I, 1);
+ break;
+ case llvm::Intrinsic::x86_sse_cvtps2pi:
+ case llvm::Intrinsic::x86_sse_cvttps2pi:
+ handleVectorConvertIntrinsic(I, 2);
+ break;
+
+ case llvm::Intrinsic::x86_avx512_psll_w_512:
+ case llvm::Intrinsic::x86_avx512_psll_d_512:
+ case llvm::Intrinsic::x86_avx512_psll_q_512:
+ case llvm::Intrinsic::x86_avx512_pslli_w_512:
+ case llvm::Intrinsic::x86_avx512_pslli_d_512:
+ case llvm::Intrinsic::x86_avx512_pslli_q_512:
+ case llvm::Intrinsic::x86_avx512_psrl_w_512:
+ case llvm::Intrinsic::x86_avx512_psrl_d_512:
+ case llvm::Intrinsic::x86_avx512_psrl_q_512:
+ case llvm::Intrinsic::x86_avx512_psra_w_512:
+ case llvm::Intrinsic::x86_avx512_psra_d_512:
+ case llvm::Intrinsic::x86_avx512_psra_q_512:
+ case llvm::Intrinsic::x86_avx512_psrli_w_512:
+ case llvm::Intrinsic::x86_avx512_psrli_d_512:
+ case llvm::Intrinsic::x86_avx512_psrli_q_512:
+ case llvm::Intrinsic::x86_avx512_psrai_w_512:
+ case llvm::Intrinsic::x86_avx512_psrai_d_512:
+ case llvm::Intrinsic::x86_avx512_psrai_q_512:
+ case llvm::Intrinsic::x86_avx512_psra_q_256:
+ case llvm::Intrinsic::x86_avx512_psra_q_128:
+ case llvm::Intrinsic::x86_avx512_psrai_q_256:
+ case llvm::Intrinsic::x86_avx512_psrai_q_128:
+ case llvm::Intrinsic::x86_avx2_psll_w:
+ case llvm::Intrinsic::x86_avx2_psll_d:
+ case llvm::Intrinsic::x86_avx2_psll_q:
+ case llvm::Intrinsic::x86_avx2_pslli_w:
+ case llvm::Intrinsic::x86_avx2_pslli_d:
+ case llvm::Intrinsic::x86_avx2_pslli_q:
+ case llvm::Intrinsic::x86_avx2_psrl_w:
+ case llvm::Intrinsic::x86_avx2_psrl_d:
+ case llvm::Intrinsic::x86_avx2_psrl_q:
+ case llvm::Intrinsic::x86_avx2_psra_w:
+ case llvm::Intrinsic::x86_avx2_psra_d:
+ case llvm::Intrinsic::x86_avx2_psrli_w:
+ case llvm::Intrinsic::x86_avx2_psrli_d:
+ case llvm::Intrinsic::x86_avx2_psrli_q:
+ case llvm::Intrinsic::x86_avx2_psrai_w:
+ case llvm::Intrinsic::x86_avx2_psrai_d:
+ case llvm::Intrinsic::x86_sse2_psll_w:
+ case llvm::Intrinsic::x86_sse2_psll_d:
+ case llvm::Intrinsic::x86_sse2_psll_q:
+ case llvm::Intrinsic::x86_sse2_pslli_w:
+ case llvm::Intrinsic::x86_sse2_pslli_d:
+ case llvm::Intrinsic::x86_sse2_pslli_q:
+ case llvm::Intrinsic::x86_sse2_psrl_w:
+ case llvm::Intrinsic::x86_sse2_psrl_d:
+ case llvm::Intrinsic::x86_sse2_psrl_q:
+ case llvm::Intrinsic::x86_sse2_psra_w:
+ case llvm::Intrinsic::x86_sse2_psra_d:
+ case llvm::Intrinsic::x86_sse2_psrli_w:
+ case llvm::Intrinsic::x86_sse2_psrli_d:
+ case llvm::Intrinsic::x86_sse2_psrli_q:
+ case llvm::Intrinsic::x86_sse2_psrai_w:
+ case llvm::Intrinsic::x86_sse2_psrai_d:
+ case llvm::Intrinsic::x86_mmx_psll_w:
+ case llvm::Intrinsic::x86_mmx_psll_d:
+ case llvm::Intrinsic::x86_mmx_psll_q:
+ case llvm::Intrinsic::x86_mmx_pslli_w:
+ case llvm::Intrinsic::x86_mmx_pslli_d:
+ case llvm::Intrinsic::x86_mmx_pslli_q:
+ case llvm::Intrinsic::x86_mmx_psrl_w:
+ case llvm::Intrinsic::x86_mmx_psrl_d:
+ case llvm::Intrinsic::x86_mmx_psrl_q:
+ case llvm::Intrinsic::x86_mmx_psra_w:
+ case llvm::Intrinsic::x86_mmx_psra_d:
+ case llvm::Intrinsic::x86_mmx_psrli_w:
+ case llvm::Intrinsic::x86_mmx_psrli_d:
+ case llvm::Intrinsic::x86_mmx_psrli_q:
+ case llvm::Intrinsic::x86_mmx_psrai_w:
+ case llvm::Intrinsic::x86_mmx_psrai_d:
+ handleVectorShiftIntrinsic(I, /* Variable */ false);
+ break;
+ case llvm::Intrinsic::x86_avx2_psllv_d:
+ case llvm::Intrinsic::x86_avx2_psllv_d_256:
+ case llvm::Intrinsic::x86_avx512_psllv_d_512:
+ case llvm::Intrinsic::x86_avx2_psllv_q:
+ case llvm::Intrinsic::x86_avx2_psllv_q_256:
+ case llvm::Intrinsic::x86_avx512_psllv_q_512:
+ case llvm::Intrinsic::x86_avx2_psrlv_d:
+ case llvm::Intrinsic::x86_avx2_psrlv_d_256:
+ case llvm::Intrinsic::x86_avx512_psrlv_d_512:
+ case llvm::Intrinsic::x86_avx2_psrlv_q:
+ case llvm::Intrinsic::x86_avx2_psrlv_q_256:
+ case llvm::Intrinsic::x86_avx512_psrlv_q_512:
+ case llvm::Intrinsic::x86_avx2_psrav_d:
+ case llvm::Intrinsic::x86_avx2_psrav_d_256:
+ case llvm::Intrinsic::x86_avx512_psrav_d_512:
+ case llvm::Intrinsic::x86_avx512_psrav_q_128:
+ case llvm::Intrinsic::x86_avx512_psrav_q_256:
+ case llvm::Intrinsic::x86_avx512_psrav_q_512:
+ handleVectorShiftIntrinsic(I, /* Variable */ true);
+ break;
+
+ case llvm::Intrinsic::x86_sse2_packsswb_128:
+ case llvm::Intrinsic::x86_sse2_packssdw_128:
+ case llvm::Intrinsic::x86_sse2_packuswb_128:
+ case llvm::Intrinsic::x86_sse41_packusdw:
+ case llvm::Intrinsic::x86_avx2_packsswb:
+ case llvm::Intrinsic::x86_avx2_packssdw:
+ case llvm::Intrinsic::x86_avx2_packuswb:
+ case llvm::Intrinsic::x86_avx2_packusdw:
+ handleVectorPackIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_packsswb:
+ case llvm::Intrinsic::x86_mmx_packuswb:
+ handleVectorPackIntrinsic(I, 16);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_packssdw:
+ handleVectorPackIntrinsic(I, 32);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_psad_bw:
+ case llvm::Intrinsic::x86_sse2_psad_bw:
+ case llvm::Intrinsic::x86_avx2_psad_bw:
+ handleVectorSadIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_sse2_pmadd_wd:
+ case llvm::Intrinsic::x86_avx2_pmadd_wd:
+ case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+ case llvm::Intrinsic::x86_avx2_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I, 8);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_pmadd_wd:
+ handleVectorPmaddIntrinsic(I, 16);
+ break;
+
+ case llvm::Intrinsic::x86_sse_cmp_ss:
+ case llvm::Intrinsic::x86_sse2_cmp_sd:
+ case llvm::Intrinsic::x86_sse_comieq_ss:
+ case llvm::Intrinsic::x86_sse_comilt_ss:
+ case llvm::Intrinsic::x86_sse_comile_ss:
+ case llvm::Intrinsic::x86_sse_comigt_ss:
+ case llvm::Intrinsic::x86_sse_comige_ss:
+ case llvm::Intrinsic::x86_sse_comineq_ss:
+ case llvm::Intrinsic::x86_sse_ucomieq_ss:
+ case llvm::Intrinsic::x86_sse_ucomilt_ss:
+ case llvm::Intrinsic::x86_sse_ucomile_ss:
+ case llvm::Intrinsic::x86_sse_ucomigt_ss:
+ case llvm::Intrinsic::x86_sse_ucomige_ss:
+ case llvm::Intrinsic::x86_sse_ucomineq_ss:
+ case llvm::Intrinsic::x86_sse2_comieq_sd:
+ case llvm::Intrinsic::x86_sse2_comilt_sd:
+ case llvm::Intrinsic::x86_sse2_comile_sd:
+ case llvm::Intrinsic::x86_sse2_comigt_sd:
+ case llvm::Intrinsic::x86_sse2_comige_sd:
+ case llvm::Intrinsic::x86_sse2_comineq_sd:
+ case llvm::Intrinsic::x86_sse2_ucomieq_sd:
+ case llvm::Intrinsic::x86_sse2_ucomilt_sd:
+ case llvm::Intrinsic::x86_sse2_ucomile_sd:
+ case llvm::Intrinsic::x86_sse2_ucomigt_sd:
+ case llvm::Intrinsic::x86_sse2_ucomige_sd:
+ case llvm::Intrinsic::x86_sse2_ucomineq_sd:
+ handleVectorCompareScalarIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_sse_cmp_ps:
+ case llvm::Intrinsic::x86_sse2_cmp_pd:
+ // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
+ // generates reasonable-looking IR that fails in the backend with "Do not
+ // know how to split the result of this operator!".
+ handleVectorComparePackedIntrinsic(I);
+ break;
+
+ default:
+ if (!handleUnknownIntrinsic(I))
+ visitInstruction(I);
+ break;
+ }
+ }
+
+ void visitCallSite(CallSite CS) {
+ Instruction &I = *CS.getInstruction();
+ assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite");
+ if (CS.isCall()) {
+ CallInst *Call = cast<CallInst>(&I);
+
+ // For inline asm, do the usual thing: check argument shadow and mark all
+ // outputs as clean. Note that any side effects of the inline asm that are
+ // not immediately visible in its constraints are not handled.
+ if (Call->isInlineAsm()) {
+ visitInstruction(I);
+ return;
+ }
+
+ assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
+
+ // We are going to insert code that relies on the fact that the callee
+ // will become a non-readonly function after it is instrumented by us. To
+ // prevent this code from being optimized out, mark that function
+ // non-readonly in advance.
+ if (Function *Func = Call->getCalledFunction()) {
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+ Func->removeAttributes(AttributeList::FunctionIndex, B);
+ }
+
+ maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
+ }
+ IRBuilder<> IRB(&I);
+
+ unsigned ArgOffset = 0;
+ DEBUG(dbgs() << " CallSite: " << I << "\n");
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned i = ArgIt - CS.arg_begin();
+ if (!A->getType()->isSized()) {
+ DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+ continue;
+ }
+ unsigned Size = 0;
+ Value *Store = nullptr;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ DEBUG(dbgs() << " Arg#" << i << ": " << *A <<
+ " Shadow: " << *ArgShadow << "\n");
+ bool ArgIsInitialized = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (CS.paramHasAttr(i, Attribute::ByVal)) {
+ assert(A->getType()->isPointerTy() &&
+ "ByVal argument is not a pointer!");
+ Size = DL.getTypeAllocSize(A->getType()->getPointerElementType());
+ if (ArgOffset + Size > kParamTLSSize) break;
+ unsigned ParamAlignment = CS.getParamAlignment(i);
+ unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
+ Store = IRB.CreateMemCpy(ArgShadowBase,
+ getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
+ Size, Alignment);
+ } else {
+ Size = DL.getTypeAllocSize(A->getType());
+ if (ArgOffset + Size > kParamTLSSize) break;
+ Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+ kShadowTLSAlignment);
+ Constant *Cst = dyn_cast<Constant>(ArgShadow);
+ if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
+ }
+ if (MS.TrackOrigins && !ArgIsInitialized)
+ IRB.CreateStore(getOrigin(A),
+ getOriginPtrForArgument(A, IRB, ArgOffset));
+ (void)Store;
+ assert(Size != 0 && Store != nullptr);
+ DEBUG(dbgs() << " Param:" << *Store << "\n");
+ ArgOffset += alignTo(Size, 8);
+ }
+ DEBUG(dbgs() << " done with call args\n");
+
+ FunctionType *FT =
+ cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
+ if (FT->isVarArg()) {
+ VAHelper->visitCallSite(CS, IRB);
+ }
+
+ // Now, get the shadow for the RetVal.
+ if (!I.getType()->isSized()) return;
+ // Don't emit the epilogue for musttail call returns.
+ if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return;
+ IRBuilder<> IRBBefore(&I);
+ // Until we have full dynamic coverage, make sure the retval shadow is 0.
+ Value *Base = getShadowPtrForRetval(&I, IRBBefore);
+ IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment);
+ BasicBlock::iterator NextInsn;
+ if (CS.isCall()) {
+ NextInsn = ++I.getIterator();
+ assert(NextInsn != I.getParent()->end());
+ } else {
+ BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();
+ if (!NormalDest->getSinglePredecessor()) {
+ // FIXME: this case is tricky, so we are just conservative here.
+ // Perhaps we need to split the edge between this BB and NormalDest,
+ // but a naive attempt to use SplitEdge leads to a crash.
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ return;
+ }
+ NextInsn = NormalDest->getFirstInsertionPt();
+ assert(NextInsn != NormalDest->end() &&
+ "Could not find insertion point for retval shadow load");
+ }
+ IRBuilder<> IRBAfter(&*NextInsn);
+ Value *RetvalShadow =
+ IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter),
+ kShadowTLSAlignment, "_msret");
+ setShadow(&I, RetvalShadow);
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));
+ }
+
+ bool isAMustTailRetVal(Value *RetVal) {
+ if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
+ RetVal = I->getOperand(0);
+ }
+ if (auto *I = dyn_cast<CallInst>(RetVal)) {
+ return I->isMustTailCall();
+ }
+ return false;
+ }
+
+ void visitReturnInst(ReturnInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *RetVal = I.getReturnValue();
+ if (!RetVal) return;
+ // Don't emit the epilogue for musttail call returns.
+ if (isAMustTailRetVal(RetVal)) return;
+ Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+ if (CheckReturnValue) {
+ insertShadowCheck(RetVal, &I);
+ Value *Shadow = getCleanShadow(RetVal);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ } else {
+ Value *Shadow = getShadow(RetVal);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ if (MS.TrackOrigins)
+ IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+ }
+ }
+
+ void visitPHINode(PHINode &I) {
+ IRBuilder<> IRB(&I);
+ if (!PropagateShadow) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ return;
+ }
+
+ ShadowPHINodes.push_back(&I);
+ setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+ "_msphi_s"));
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+ "_msphi_o"));
+ }
+
+ void visitAllocaInst(AllocaInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ IRBuilder<> IRB(I.getNextNode());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+ Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+ if (I.isArrayAllocation())
+ Len = IRB.CreateMul(Len, I.getArraySize());
+ if (PoisonStack && ClPoisonStackWithCall) {
+ IRB.CreateCall(MS.MsanPoisonStackFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
+ } else {
+ Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
+ Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
+ }
+
+ if (PoisonStack && MS.TrackOrigins) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ // We create a string with a description of the stack allocation and
+ // pass it into __msan_set_alloca_origin.
+ // It will be printed by the run-time if stack-originated UMR is found.
+ // The first 4 bytes of the string are set to '----' and will be replaced
+ // by __msan_va_arg_overflow_size_tls at the first call.
+ StackDescription << "----" << I.getName() << "@" << F.getName();
+ Value *Descr =
+ createPrivateNonConstGlobalForString(*F.getParent(),
+ StackDescription.str());
+
+ IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(&F, MS.IntptrTy)});
+ }
+ }
+
+ void visitSelectInst(SelectInst& I) {
+ IRBuilder<> IRB(&I);
+ // a = select b, c, d
+ Value *B = I.getCondition();
+ Value *C = I.getTrueValue();
+ Value *D = I.getFalseValue();
+ Value *Sb = getShadow(B);
+ Value *Sc = getShadow(C);
+ Value *Sd = getShadow(D);
+
+ // Result shadow if condition shadow is 0.
+ Value *Sa0 = IRB.CreateSelect(B, Sc, Sd);
+ Value *Sa1;
+ if (I.getType()->isAggregateType()) {
+ // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+ // an extra "select". This results in much more compact IR.
+ // Sa = select Sb, poisoned, (select b, Sc, Sd)
+ Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
+ } else {
+ // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
+ // If Sb (condition is poisoned), look for bits in c and d that are equal
+ // and both unpoisoned.
+ // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
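+ // For example (scalar case): if c and d are fully initialized and c != d,
+ // (c^d) | Sc | Sd is nonzero, so a poisoned condition poisons the result;
+ // if c == d the result does not depend on the condition and stays clean.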
+
+ // Cast arguments to shadow-compatible type.
+ C = CreateAppToShadowCast(IRB, C);
+ D = CreateAppToShadowCast(IRB, D);
+
+ // Result shadow if condition shadow is 1.
+ Sa1 = IRB.CreateOr(IRB.CreateXor(C, D), IRB.CreateOr(Sc, Sd));
+ }
+ Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select");
+ setShadow(&I, Sa);
+ if (MS.TrackOrigins) {
+ // Origins are always i32, so any vector conditions must be flattened.
+ // FIXME: consider tracking vector origins for app vectors?
+ if (B->getType()->isVectorTy()) {
+ Type *FlatTy = getShadowTyNoVec(B->getType());
+ B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ }
+ // a = select b, c, d
+ // Oa = Sb ? Ob : (b ? Oc : Od)
+ setOrigin(
+ &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
+ IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
+ getOrigin(I.getFalseValue()))));
+ }
+ }
+
+ void visitLandingPadInst(LandingPadInst &I) {
+ // Do nothing.
+ // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitCatchSwitchInst(CatchSwitchInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitFuncletPadInst(FuncletPadInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &I) {
+ handleShadowOr(I);
+ }
+
+ void visitExtractValueInst(ExtractValueInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Agg = I.getAggregateOperand();
+ DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ Value *AggShadow = getShadow(Agg);
+ DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+ DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ setShadow(&I, ResShadow);
+ setOriginForNaryOp(I);
+ }
+
+ void visitInsertValueInst(InsertValueInst &I) {
+ IRBuilder<> IRB(&I);
+ DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ Value *AggShadow = getShadow(I.getAggregateOperand());
+ Value *InsShadow = getShadow(I.getInsertedValueOperand());
+ DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+ DEBUG(dbgs() << " Res: " << *Res << "\n");
+ setShadow(&I, Res);
+ setOriginForNaryOp(I);
+ }
+
+ void dumpInst(Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+ } else {
+ errs() << "ZZZ " << I.getOpcodeName() << "\n";
+ }
+ errs() << "QQQ " << I << "\n";
+ }
+
+ void visitResumeInst(ResumeInst &I) {
+ DEBUG(dbgs() << "Resume: " << I << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) {
+ DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCatchReturnInst(CatchReturnInst &CRI) {
+ DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void visitInstruction(Instruction &I) {
+ // Everything else: stop propagating and check for poisoned shadow.
+ if (ClDumpStrictInstructions)
+ dumpInst(I);
+ DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
+ Value *Operand = I.getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+};
+
+/// \brief AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+ // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+ // See a comment in visitCallSite for more details.
+ static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+ static const unsigned AMD64FpEndOffset = 176;
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgOverflowSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+ VAArgOverflowSize(nullptr) {}
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ ArgKind classifyArgument(Value* arg) {
+ // A very rough approximation of X86_64 argument classification rules.
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+ return AK_FloatingPoint;
+ if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ return AK_GeneralPurpose;
+ if (T->isPointerTy())
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
+ // For VarArg functions, store the argument shadow in an ABI-specific format
+ // that corresponds to va_list layout.
+ // We do this because Clang lowers va_arg in the frontend, and this pass
+ // only sees the low level code that deals with va_list internals.
+ // A much easier alternative (provided that Clang emits va_arg instructions)
+ // would have been to associate each live instance of va_list with a copy of
+ // MSanParamTLS, and extract the shadow on each va_arg() call in argument
+ // list order.
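+ //
+ // Roughly, the va_arg shadow TLS is laid out as follows: bytes [0, 48) hold
+ // the shadow of general-purpose register arguments, bytes [48, 176) the
+ // shadow of FP/SSE register arguments, and bytes from 176 onwards the shadow
+ // of arguments passed in memory (the overflow area).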
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ unsigned GpOffset = 0;
+ unsigned FpOffset = AMD64GpEndOffset;
+ unsigned OverflowOffset = AMD64FpEndOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ // ByVal arguments always go to the overflow area.
+ // Fixed arguments passed through the overflow area will be stepped
+ // over by va_start, so don't count them towards the offset.
+ if (IsFixed)
+ continue;
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = A->getType()->getPointerElementType();
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
+ ArgSize, kShadowTLSAlignment);
+ } else {
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+ AK = AK_Memory;
+ Value *Base;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, GpOffset);
+ GpOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, FpOffset);
+ FpOffset += 16;
+ break;
+ case AK_Memory:
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ }
+ // Take fixed arguments into account for GpOffset and FpOffset,
+ // but don't actually store shadows for them.
+ if (IsFixed)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// \brief Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ if (F.getCallingConv() == CallingConv::X86_64_Win64)
+ return;
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */24, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ if (F.getCallingConv() == CallingConv::X86_64_Win64)
+ return;
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */24, /* alignment */8, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
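+ // In the AMD64 va_list, the reg_save_area pointer is at offset 16 and the
+ // overflow_arg_area pointer at offset 8; copy the backed-up shadow into the
+ // shadow of both areas.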
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 16)),
+ Type::getInt64PtrTy(*MS.C));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr =
+ MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy,
+ AMD64FpEndOffset, 16);
+
+ Value *OverflowArgAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 8)),
+ Type::getInt64PtrTy(*MS.C));
+ Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr =
+ MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
+ }
+ }
+};
+
+/// \brief MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+ VAArgSize(nullptr) {}
+
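+ // On MIPS64 every vararg occupies an 8-byte slot in a single save area, so
+ // the shadows are simply laid out sequentially, skipping the named
+ // arguments.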
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ unsigned VAArgOffset = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin() +
+ CS.getFunctionType()->getNumParams(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
+ Value *A = *ArgIt;
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ if (TargetTriple.getArch() == llvm::Triple::mips64) {
+ // Adjust the shadow for arguments with size < 8 to match the placement
+ // of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset);
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+ // Here we use VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating a new
+ // class member; it holds the total size of all VarArgs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// \brief Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ Type::getInt64PtrTy(*MS.C));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr =
+ MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+ }
+ }
+};
+
+
+/// \brief AArch64-specific implementation of VarArgHelper.
+struct VarArgAArch64Helper : public VarArgHelper {
+ static const unsigned kAArch64GrArgSize = 64;
+ static const unsigned kAArch64VrArgSize = 128;
+
+ static const unsigned AArch64GrBegOffset = 0;
+ static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
+ // Make VR space aligned to 16 bytes.
+ static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
+ static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+ + kAArch64VrArgSize;
+ static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgOverflowSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+ VAArgOverflowSize(nullptr) {}
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ ArgKind classifyArgument(Value* arg) {
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy())
+ return AK_FloatingPoint;
+ if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ || (T->isPointerTy()))
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
+ // The instrumentation stores the argument shadow in a non ABI-specific
+ // format because it does not know which argument is named (since Clang,
+ // as in the x86_64 case, lowers va_args in the frontend and this pass only
+ // sees the low-level code that deals with va_list internals).
+ // The first eight GR registers are saved in the first 64 bytes of the
+ // va_arg TLS array, followed by the first eight FP/SIMD registers, and then
+ // the remaining arguments.
+ // Using a constant offset within the va_arg TLS array allows a fast copy
+ // in finalizeInstrumentation.
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ unsigned GrOffset = AArch64GrBegOffset;
+ unsigned VrOffset = AArch64VrBegOffset;
+ unsigned OverflowOffset = AArch64VAEndOffset;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
+ AK = AK_Memory;
+ Value *Base;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset);
+ GrOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset);
+ VrOffset += 16;
+ break;
+ case AK_Memory:
+ // Don't count fixed arguments in the overflow area - va_start will
+ // skip right over them.
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ break;
+ }
+ // Count Gp/Vr fixed arguments to their respective offsets, but don't
+ // bother to actually store a shadow.
+ if (IsFixed)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants (size of va_list).
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */32, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants (size of va_list).
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */32, /* alignment */8, false);
+ }
+
+ // Retrieve a va_list field of 'void*' size.
+ Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt64PtrTy(*MS.C));
+ return IRB.CreateLoad(SaveAreaPtrPtr);
+ }
+
+ // Retrieve a va_list field of 'int' size.
+ Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt32PtrTy(*MS.C));
+ Value *SaveArea32 = IRB.CreateLoad(SaveAreaPtr);
+ return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
+ Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
+
+ // Instrument va_start, copy va_list shadow from the backup copy of
+ // the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+ // The variadic ABI for AArch64 creates two areas to save the incoming
+ // argument registers (one for the 64-bit general registers x0-x7 and
+ // another for the 128-bit FP/SIMD registers v0-v7).
+ // We then need to propagate the argument shadow into both regions,
+ // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
+ // The remaining arguments are saved in the shadow of 'va::__stack'.
+ // One caveat is that only the unnamed (variadic) arguments need to be
+ // propagated; however, the call-site instrumentation saves 'all' the
+ // arguments. So to copy the shadow values from the va_arg TLS array
+ // we need to adjust the offset for both the GR and VR fields based on
+ // the __{gr,vr}_offs value (since those offsets account for the incoming
+ // named arguments).
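+ //
+ // The va_list fields read below are, per AAPCS64: __stack at offset 0,
+ // __gr_top at offset 8, __vr_top at offset 16, __gr_offs at offset 24 and
+ // __vr_offs at offset 28.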
+
+ // Read the stack pointer from the va_list.
+ Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
+
+ // Read both the __gr_top and __gr_off and add them up.
+ Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
+ Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
+
+ Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
+
+ // Read both the __vr_top and __vr_off and add them up.
+ Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
+ Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
+
+ Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
+
+ // The instrumentation does not know how many named arguments are being
+ // used and, at the call site, all the arguments were saved. Since
+ // __gr_offs is defined as '0 - ((8 - named_gr) * 8)', the idea is to
+ // propagate only the variadic arguments by ignoring the bytes of shadow
+ // that come from named arguments.
+ Value *GrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(GrArgSize, GrOffSaveArea);
+
+ Value *GrRegSaveAreaShadowPtr =
+ MSV.getShadowPtr(GrRegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+ Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ GrRegSaveAreaShadowPtrOff);
+ Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8);
+
+ // Again, but for FP/SIMD values.
+ Value *VrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(VrArgSize, VrOffSaveArea);
+
+ Value *VrRegSaveAreaShadowPtr =
+ MSV.getShadowPtr(VrRegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+ Value *VrSrcPtr = IRB.CreateInBoundsGEP(
+ IRB.getInt8Ty(),
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VrBegOffset)),
+ VrRegSaveAreaShadowPtrOff);
+ Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8);
+
+ // And finally for remaining arguments.
+ Value *StackSaveAreaShadowPtr =
+ MSV.getShadowPtr(StackSaveAreaPtr, IRB.getInt8Ty(), IRB);
+
+ Value *StackSrcPtr =
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VAEndOffset));
+
+ IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr,
+ VAArgOverflowSize, 16);
+ }
+ }
+};
+
+/// \brief PowerPC64-specific implementation of VarArgHelper.
+struct VarArgPowerPC64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+ VAArgSize(nullptr) {}
+
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ // For PowerPC, we need to deal with alignment of stack arguments -
+ // they are mostly aligned to 8 bytes, but vectors and i128 arrays
+ // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes,
+ // and QPX vectors are aligned to 32 bytes. For that reason, we
+ // compute the current offset from the stack pointer (which is always
+ // properly aligned) and the offset of the first vararg, then subtract them.
+ unsigned VAArgBase;
+ llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
+ // The parameter save area starts 48 bytes from the frame pointer for ABIv1,
+ // and 32 bytes for ABIv2. This is usually determined by the target
+ // endianness, but in theory could be overridden by a function attribute.
+ // For simplicity, we ignore it here (it'd only matter for QPX vectors).
+ if (TargetTriple.getArch() == llvm::Triple::ppc64)
+ VAArgBase = 48;
+ else
+ VAArgBase = 32;
+ unsigned VAArgOffset = VAArgBase;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CS.getArgumentNo(ArgIt);
+ bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = A->getType()->getPointerElementType();
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ uint64_t ArgAlign = CS.getParamAlignment(ArgNo);
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (!IsFixed) {
+ Value *Base = getShadowPtrForVAArgument(RealTy, IRB,
+ VAArgOffset - VAArgBase);
+ IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
+ ArgSize, kShadowTLSAlignment);
+ }
+ VAArgOffset += alignTo(ArgSize, 8);
+ } else {
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ uint64_t ArgAlign = 8;
+ if (A->getType()->isArrayTy()) {
+ // Arrays are aligned to element size, except for long double
+ // arrays, which are aligned to 8 bytes.
+ Type *ElementTy = A->getType()->getArrayElementType();
+ if (!ElementTy->isPPC_FP128Ty())
+ ArgAlign = DL.getTypeAllocSize(ElementTy);
+ } else if (A->getType()->isVectorTy()) {
+ // Vectors are naturally aligned.
+ ArgAlign = DL.getTypeAllocSize(A->getType());
+ }
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (DL.isBigEndian()) {
+ // Adjust the shadow for arguments with size < 8 to match the placement
+ // of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ if (!IsFixed) {
+ Base = getShadowPtrForVAArgument(A->getType(), IRB,
+ VAArgOffset - VAArgBase);
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ }
+ if (IsFixed)
+ VAArgBase = VAArgOffset;
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
+ VAArgOffset - VAArgBase);
+ // Here we use VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating a new
+ // class member; it holds the total size of all VarArgs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// \brief Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */8, /* alignment */8, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ Type::getInt64PtrTy(*MS.C));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr =
+ MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+ }
+ }
+};
+
+/// \brief A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+ VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) {}
+
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {}
+
+ void visitVAStartInst(VAStartInst &I) override {}
+
+ void visitVACopyInst(VACopyInst &I) override {}
+
+ void finalizeInstrumentation() override {}
+};
+
+VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor) {
+ // Full VarArg handling is implemented for AMD64, MIPS64, AArch64 and
+ // PowerPC64; other targets fall back to a no-op helper, so false positives
+ // are possible there.
+ llvm::Triple TargetTriple(Func.getParent()->getTargetTriple());
+ if (TargetTriple.getArch() == llvm::Triple::x86_64)
+ return new VarArgAMD64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == llvm::Triple::mips64 ||
+ TargetTriple.getArch() == llvm::Triple::mips64el)
+ return new VarArgMIPS64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == llvm::Triple::aarch64)
+ return new VarArgAArch64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == llvm::Triple::ppc64 ||
+ TargetTriple.getArch() == llvm::Triple::ppc64le)
+ return new VarArgPowerPC64Helper(Func, Msan, Visitor);
+ else
+ return new VarArgNoOpHelper(Func, Msan, Visitor);
+}
+
+} // anonymous namespace
+
+bool MemorySanitizer::runOnFunction(Function &F) {
+ if (&F == MsanCtorFunction)
+ return false;
+ MemorySanitizerVisitor Visitor(F, *this);
+
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+ F.removeAttributes(AttributeList::FunctionIndex, B);
+
+ return Visitor.runOnFunction();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
new file mode 100644
index 000000000000..8e4bfc0b91bc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -0,0 +1,1562 @@
+//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements PGO instrumentation using a minimum spanning tree based
+// on the following paper:
+// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
+// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
+// Issue 3, pp 313-322
+// The algorithm is based on the fact that for each node (except for the entry
+// and exit), the sum of incoming edge counts equals the sum of outgoing edge
+// counts. The count of an edge on the spanning tree can therefore be derived
+// from the counts of the edges not on the spanning tree. Knuth proves that
+// this method instruments the minimum number of edges.
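+//
+// For example, on a hypothetical diamond CFG
+//
+//          Entry
+//          /   \
+//         A     B
+//          \   /
+//          Exit
+//
+// with a fake Exit->Entry edge closing the graph (4 nodes, 5 edges), any
+// spanning tree has 3 edges, so only the 2 off-tree edges need counters. If
+// Entry->A and Exit->Entry are the off-tree (counted) edges, then by flow
+// conservation:
+//   count(A->Exit)  = count(Entry->A)
+//   count(Entry->B) = count(Exit->Entry) - count(Entry->A)
+//   count(B->Exit)  = count(Entry->B)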
+//
+// The minimal spanning tree here is actually a maximum weight tree -- on-tree
+// edges have higher frequencies (more likely to execute). The idea is to
+// instrument those less frequently executed edges to reduce the runtime
+// overhead of instrumented binaries.
+//
+// This file contains two passes:
+// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
+// count profile, and generates the instrumentation for indirect call
+// profiling.
+// (2) Pass PGOInstrumentationUse which reads the edge count profile and
+// annotates the branch weights. It also reads the indirect call value
+// profiling records and annotates the indirect call instructions.
+//
+// To get precise counter information, these two passes need to be invoked at
+// the same compilation point (so they see the same IR). For pass
+// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
+// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc; the
+// profile is opened at the module level and passed to each PGOUseFunc
+// instance.
+// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
+// in class FuncPGOInstrumentation.
+//
+// Class PGOEdge represents a CFG edge and some auxiliary information. Class
+// BBInfo contains auxiliary information for each BB. These two classes are used
+// in pass PGOInstrumentationGen. Classes PGOUseEdge and UseBBInfo are the
+// derived classes of PGOEdge and BBInfo, respectively. They contain extra
+// data structures used in populating profile counters.
+// The MST implementation is in Class CFGMST (CFGMST.h).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "CFGMST.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/JamCRC.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-instrumentation"
+
+STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOSelectInsts, "Number of select instructions instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
+STATISTIC(NumOfPGOEdge, "Number of edges.");
+STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
+STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
+STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
+STATISTIC(NumOfPGOMismatch, "Number of functions having mismatched profiles.");
+STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
+
+// Command line option to specify the file to read profile from. This is
+// mainly used for testing.
+static cl::opt<std::string>
+ PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
+ cl::value_desc("filename"),
+ cl::desc("Specify the path of profile data file. This is"
+ "mainly for test purpose."));
+
+// Command line option to disable value profiling. The default is false:
+// i.e. value profiling is enabled by default. This is for debugging purposes.
+static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable Value Profiling"));
+
+// Command line option to set the maximum number of VP annotations to write to
+// the metadata for a single indirect call callsite.
+static cl::opt<unsigned> MaxNumAnnotations(
+ "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of annotations for a single indirect "
+ "call callsite"));
+
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+ "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of preicise value annotations for a single memop"
+ "intrinsic"));
+
+// Command line option to control appending FunctionHash to the name of a COMDAT
+// function. This is to avoid the hash mismatch caused by the preinliner.
+static cl::opt<bool> DoComdatRenaming(
+ "do-comdat-renaming", cl::init(false), cl::Hidden,
+ cl::desc("Append function hash to the name of COMDAT function to avoid "
+ "function hash mismatch due to the preinliner"));
+
+// Command line option to enable/disable the warning about missing profile
+// information.
+static cl::opt<bool>
+ PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "warnings about missing profile data for "
+ "functions."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data.
+static cl::opt<bool>
+ NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on "
+ "warnings about profile cfg mismatch."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data for Comdat functions, which often turns out to be false
+// positive due to the pre-instrumentation inline.
+static cl::opt<bool>
+ NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+ cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "functions."));
+
+// Command line option to enable/disable select instruction instrumentation.
+static cl::opt<bool>
+ PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off SELECT "
+ "instruction instrumentation. "));
+
+// Command line option to turn on CFG dot dump of raw profile counts
+static cl::opt<bool>
+ PGOViewRawCounts("pgo-view-raw-counts", cl::init(false), cl::Hidden,
+ cl::desc("A boolean option to show CFG dag "
+ "with raw profile counts from "
+ "profile data. See also option "
+ "-pgo-view-counts. To limit graph "
+ "display to only one function, use "
+ "filtering option -view-bfi-func-name."));
+
+// Command line option to enable/disable memop intrinsic calls' size profiling.
+static cl::opt<bool>
+ PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "memory intrinsic size profiling."));
+
+// Emit branch probability as optimization remarks.
+static cl::opt<bool>
+ EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
+ cl::desc("When this option is on, the annotated "
+ "branch probability will be emitted as "
+ " optimization remarks: -Rpass-analysis="
+ "pgo-instr-use"));
+
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<bool> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+namespace {
+
+// Return a string describing the branch condition that can be
+// used in static branch probability heuristics:
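+// For example, a conditional branch on "icmp eq i32 %x, 0" yields
+// "eq_i32_Zero": the predicate name, the compared operand type, and a coarse
+// classification of the constant right-hand side, if any.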
+std::string getBranchCondString(Instruction *TI) {
+ BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return std::string();
+
+ Value *Cond = BI->getCondition();
+ ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+ if (!CI)
+ return std::string();
+
+ std::string result;
+ raw_string_ostream OS(result);
+ OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
+ CI->getOperand(0)->getType()->print(OS, true);
+
+ Value *RHS = CI->getOperand(1);
+ ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+ if (CV) {
+ if (CV->isZero())
+ OS << "_Zero";
+ else if (CV->isOne())
+ OS << "_One";
+ else if (CV->isMinusOne())
+ OS << "_MinusOne";
+ else
+ OS << "_Const";
+ }
+ OS.flush();
+ return result;
+}
+
+/// The select instruction visitor plays three roles specified
+/// by the mode. In \c VM_counting mode, it simply counts the number of
+/// select instructions. In \c VM_instrument mode, it inserts code to count
+/// the number of times the TrueValue of a select is taken. In \c VM_annotate
+/// mode, it reads the profile data and annotates the select instructions with
+/// metadata.
+enum VisitMode { VM_counting, VM_instrument, VM_annotate };
+class PGOUseFunc;
+
+/// Instruction Visitor class to visit select instructions.
+struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
+ Function &F;
+ unsigned NSIs = 0; // Number of select instructions instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+
+ SelectInstVisitor(Function &Func) : F(Func) {}
+
+ void countSelects(Function &Func) {
+ NSIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+ // Visit the IR stream and instrument all select instructions. \p
+ // Ind is a pointer to the counter index variable; \p TotalNC
+ // is the total number of counters; \p FNV is the pointer to the
+ // PGO function name var; \p FHash is the function hash.
+ void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ CurCtrIdx = Ind;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ // Visit the IR stream and annotate all select instructions.
+ void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
+ Mode = VM_annotate;
+ UseFunc = UF;
+ CurCtrIdx = Ind;
+ visit(Func);
+ }
+
+ void instrumentOneSelectInst(SelectInst &SI);
+ void annotateOneSelectInst(SelectInst &SI);
+ // Visit \p SI instruction and perform tasks according to visit mode.
+ void visitSelectInst(SelectInst &SI);
+ // Return the number of select instructions. This needs to be called after
+ // countSelects().
+ unsigned getNumOfSelectInsts() const { return NSIs; }
+};
+
+/// Instruction Visitor class to visit memory intrinsic calls.
+struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> {
+ Function &F;
+ unsigned NMemIs = 0; // Number of memIntrinsics instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned CurCtrId = 0; // Current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+ std::vector<Instruction *> Candidates;
+
+ MemIntrinsicVisitor(Function &Func) : F(Func) {}
+
+ void countMemIntrinsics(Function &Func) {
+ NMemIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+
+ void instrumentMemIntrinsics(Function &Func, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ std::vector<Instruction *> findMemIntrinsics(Function &Func) {
+ Candidates.clear();
+ Mode = VM_annotate;
+ visit(Func);
+ return Candidates;
+ }
+
+ // Instrument one memory intrinsic call instruction.
+ void instrumentOneMemIntrinsic(MemIntrinsic &MI);
+ // Visit \p MI instruction and perform tasks according to visit mode.
+ void visitMemIntrinsic(MemIntrinsic &SI);
+ unsigned getNumOfMemIntrinsics() const { return NMemIs; }
+};
+
+class PGOInstrumentationGenLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOInstrumentationGenLegacyPass() : ModulePass(ID) {
+ initializePGOInstrumentationGenLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
+
+private:
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ }
+};
+
+class PGOInstrumentationUseLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ // Provide the profile filename as the parameter.
+ PGOInstrumentationUseLegacyPass(std::string Filename = "")
+ : ModulePass(ID), ProfileFileName(std::move(Filename)) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+ initializePGOInstrumentationUseLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
+
+private:
+ std::string ProfileFileName;
+
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char PGOInstrumentationGenLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationGenLegacyPass() {
+ return new PGOInstrumentationGenLegacyPass();
+}
+
+char PGOInstrumentationUseLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename) {
+ return new PGOInstrumentationUseLegacyPass(Filename.str());
+}
+
+namespace {
+/// \brief An MST based instrumentation for PGO
+///
+/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
+/// in the function level.
+struct PGOEdge {
+ // This class implements the CFG edges. Note the CFG can be a multi-graph.
+ // So there might be multiple edges with the same SrcBB and DestBB.
+ const BasicBlock *SrcBB;
+ const BasicBlock *DestBB;
+ uint64_t Weight;
+ bool InMST;
+ bool Removed;
+ bool IsCritical;
+ PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1)
+ : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false),
+ IsCritical(false) {}
+ // Return the information string of an edge.
+ const std::string infoString() const {
+ return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
+ (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
+ }
+};
+
+// This class stores the auxiliary information for each BB.
+struct BBInfo {
+ BBInfo *Group;
+ uint32_t Index;
+ uint32_t Rank;
+
+ BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {}
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ return (Twine("Index=") + Twine(Index)).str();
+ }
+};
+
+// This class implements the per-function instrumentation logic shared by the
+// PGOInstrumentationGen and PGOInstrumentationUse passes.
+template <class Edge, class BBInfo> class FuncPGOInstrumentation {
+private:
+ Function &F;
+ void computeCFGHash();
+ void renameComdatFunction();
+ // A map that stores the Comdat group in function F.
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
+
+public:
+ std::vector<std::vector<Instruction *>> ValueSites;
+ SelectInstVisitor SIVisitor;
+ MemIntrinsicVisitor MIVisitor;
+ std::string FuncName;
+ GlobalVariable *FuncNameVar;
+ // CFG hash value for this function.
+ uint64_t FunctionHash;
+
+ // The Minimum Spanning Tree of function CFG.
+ CFGMST<Edge, BBInfo> MST;
+
+ // Given an edge, find the BB that will be instrumented.
+ // Return nullptr if there is no BB to be instrumented.
+ BasicBlock *getInstrBB(Edge *E);
+
+ // Return the auxiliary BB information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+
+ // Return the auxiliary BB information if available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
+
+ // Dump edges and BB information.
+ void dumpInfo(std::string Str = "") const {
+ MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
+ Twine(FunctionHash) + "\t" + Str);
+ }
+
+ FuncPGOInstrumentation(
+ Function &Func,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
+ BlockFrequencyInfo *BFI = nullptr)
+ : F(Func), ComdatMembers(ComdatMembers), ValueSites(IPVK_Last + 1),
+ SIVisitor(Func), MIVisitor(Func), FunctionHash(0), MST(F, BPI, BFI) {
+
+ // This should be done before CFG hash computation.
+ SIVisitor.countSelects(Func);
+ MIVisitor.countMemIntrinsics(Func);
+ NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics();
+ ValueSites[IPVK_IndirectCallTarget] = findIndirectCallSites(Func);
+ ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func);
+
+ FuncName = getPGOFuncName(F);
+ computeCFGHash();
+ if (ComdatMembers.size())
+ renameComdatFunction();
+ DEBUG(dumpInfo("after CFGMST"));
+
+ NumOfPGOBB += MST.BBInfos.size();
+ for (auto &E : MST.AllEdges) {
+ if (E->Removed)
+ continue;
+ NumOfPGOEdge++;
+ if (!E->InMST)
+ NumOfPGOInstrument++;
+ }
+
+ if (CreateGlobalVar)
+ FuncNameVar = createPGOFuncNameVar(F, FuncName);
+ }
+
+ // Return the number of profile counters needed for the function.
+ unsigned getNumCounters() {
+ unsigned NumCounters = 0;
+ for (auto &E : this->MST.AllEdges) {
+ if (!E->InMST && !E->Removed)
+ NumCounters++;
+ }
+ return NumCounters + SIVisitor.getNumOfSelectInsts();
+ }
+};
+
+// Compute the hash value for the CFG: the lower 32 bits are the CRC32 of the
+// index values of each BB's successors; the upper 32 bits pack the numbers of
+// select instructions, indirect call sites, and CFG edges.
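+// A sketch of the resulting 64-bit value (assuming each count fits its field):
+//   bits [63:56]  number of select instructions
+//   bits [55:48]  number of indirect call sites
+//   bits [47:32]  number of CFG edges
+//   bits [31: 0]  JamCRC of the serialized successor indices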
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
+ std::vector<char> Indexes;
+ JamCRC JC;
+ for (auto &BB : F) {
+ const TerminatorInst *TI = BB.getTerminator();
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ auto BI = findBBInfo(Succ);
+ if (BI == nullptr)
+ continue;
+ uint32_t Index = BI->Index;
+ for (int J = 0; J < 4; J++)
+ Indexes.push_back((char)(Index >> (J * 8)));
+ }
+ }
+ JC.update(Indexes);
+ FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
+ (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
+ (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
+}
+
+// Check if we can safely rename this Comdat function.
+static bool canRenameComdat(
+ Function &F,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
+ return false;
+
+ // FIXME: Currently we only handle Comdat groups that contain nothing but
+ // one function and function aliases.
+ // (1) For a Comdat group containing multiple functions, we would need a
+ // unique postfix based on each function's hash. Doing that efficiently
+ // requires a non-trivial code refactoring.
+ // (2) Variables cannot be renamed, so we cannot rename a Comdat function in
+ // a group that includes global variables.
+ Comdat *C = F.getComdat();
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ if (dyn_cast<GlobalAlias>(CM.second))
+ continue;
+ Function *FM = dyn_cast<Function>(CM.second);
+ if (FM != &F)
+ return false;
+ }
+ return true;
+}
+
+// Append the CFGHash to the Comdat function name.
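+// For example, a function "foo" with hash 1234 is renamed to "foo.1234", and
+// a weak alias named "foo" is created so existing references still resolve.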
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
+ if (!canRenameComdat(F, ComdatMembers))
+ return;
+ std::string OrigName = F.getName().str();
+ std::string NewFuncName =
+ Twine(F.getName() + "." + Twine(FunctionHash)).str();
+ F.setName(Twine(NewFuncName));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
+ FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
+ Comdat *NewComdat;
+ Module *M = F.getParent();
+ // For AvailableExternallyLinkage functions, change the linkage to
+ // LinkOnceODR and put them into comdat. This is because after renaming, there
+ // is no backup external copy available for the function.
+ if (!F.hasComdat()) {
+ assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
+ NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
+ F.setLinkage(GlobalValue::LinkOnceODRLinkage);
+ F.setComdat(NewComdat);
+ return;
+ }
+
+ // This function belongs to a single function Comdat group.
+ Comdat *OrigComdat = F.getComdat();
+ std::string NewComdatName =
+ Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
+ NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
+ NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
+
+ for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(CM.second)) {
+ // For aliases, change the name directly.
+ assert(dyn_cast<Function>(GA->getAliasee()->stripPointerCasts()) == &F);
+ std::string OrigGAName = GA->getName().str();
+ GA->setName(Twine(GA->getName() + "." + Twine(FunctionHash)));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigGAName, GA);
+ continue;
+ }
+ // Must be a function.
+ Function *CF = dyn_cast<Function>(CM.second);
+ assert(CF);
+ CF->setComdat(NewComdat);
+ }
+}
+
+// Given a CFG edge E to be instrumented, find the BB in which to place the
+// instrumentation code. The function splits the critical edge if necessary.
+template <class Edge, class BBInfo>
+BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
+ if (E->InMST || E->Removed)
+ return nullptr;
+
+ BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+ BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+ // For a fake edge, instrument the real BB.
+ if (SrcBB == nullptr)
+ return DestBB;
+ if (DestBB == nullptr)
+ return SrcBB;
+
+ // Instrument the SrcBB if it has a single successor,
+ // otherwise, the DestBB if this is not a critical edge.
+ TerminatorInst *TI = SrcBB->getTerminator();
+ if (TI->getNumSuccessors() <= 1)
+ return SrcBB;
+ if (!E->IsCritical)
+ return DestBB;
+
+ // For a critical edge, we have to split. Instrument the newly
+ // created BB.
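+ // (Instrumenting either endpoint directly would also count executions that
+ // take the other edges out of SrcBB or into DestBB.)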
+ NumOfPGOSplit++;
+ DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> "
+ << getBBInfo(DestBB).Index << "\n");
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum);
+ assert(InstrBB && "Critical edge is not split");
+
+ E->Removed = true;
+ return InstrBB;
+}
+
+// Visit all edges, instrument the edges not in the MST, and do value profiling.
+// Critical edges will be split.
+static void instrumentOneFunc(
+ Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, ComdatMembers, true, BPI,
+ BFI);
+ unsigned NumCounters = FuncInfo.getNumCounters();
+
+ uint32_t I = 0;
+ Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
+ for (auto &E : FuncInfo.MST.AllEdges) {
+ BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get());
+ if (!InstrBB)
+ continue;
+
+ IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
+ assert(Builder.GetInsertPoint() != InstrBB->end() &&
+ "Cannot get the Instrumentation point");
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
+ {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
+ Builder.getInt32(I++)});
+ }
+
+ // Now instrument select instructions:
+ FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
+ FuncInfo.FunctionHash);
+ assert(I == NumCounters);
+
+ if (DisableValueProfiling)
+ return;
+
+ unsigned NumIndirectCallSites = 0;
+ for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
+ CallSite CS(I);
+ Value *Callee = CS.getCalledValue();
+ DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
+ << NumIndirectCallSites << "\n");
+ IRBuilder<> Builder(I);
+ assert(Builder.GetInsertPoint() != I->getParent()->end() &&
+ "Cannot get the Instrumentation point");
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash),
+ Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
+ Builder.getInt32(IPVK_IndirectCallTarget),
+ Builder.getInt32(NumIndirectCallSites++)});
+ }
+ NumOfPGOICall += NumIndirectCallSites;
+
+ // Now instrument memop intrinsic calls.
+ FuncInfo.MIVisitor.instrumentMemIntrinsics(
+ F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash);
+}
+
+// This class represents a CFG edge in profile use compilation.
+struct PGOUseEdge : public PGOEdge {
+ bool CountValid;
+ uint64_t CountValue;
+ PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1)
+ : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {}
+
+ // Set edge count value
+ void setEdgeCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string for this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return PGOEdge::infoString();
+ return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
+ .str();
+ }
+};
+
+typedef SmallVector<PGOUseEdge *, 2> DirectEdges;
+
+// This class stores the auxiliary information for each BB.
+struct UseBBInfo : public BBInfo {
+ uint64_t CountValue;
+ bool CountValid;
+ int32_t UnknownCountInEdge;
+ int32_t UnknownCountOutEdge;
+ DirectEdges InEdges;
+ DirectEdges OutEdges;
+ UseBBInfo(unsigned IX)
+ : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0),
+ UnknownCountOutEdge(0) {}
+ UseBBInfo(unsigned IX, uint64_t C)
+ : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0),
+ UnknownCountOutEdge(0) {}
+
+ // Set the profile count value for this BB.
+ void setBBInfoCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return BBInfo::infoString();
+ return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
+ }
+};
+
+// Sum up the count values for all the edges.
+static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
+ uint64_t Total = 0;
+ for (auto &E : Edges) {
+ if (E->Removed)
+ continue;
+ Total += E->CountValue;
+ }
+ return Total;
+}
+
+class PGOUseFunc {
+public:
+ PGOUseFunc(Function &Func, Module *Modu,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ BranchProbabilityInfo *BPI = nullptr,
+ BlockFrequencyInfo *BFI = nullptr)
+ : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
+ CountPosition(0), ProfileCountSize(0), FreqAttr(FFA_Normal) {}
+
+ // Read counts for the instrumented BB from profile.
+ bool readCounters(IndexedInstrProfReader *PGOReader);
+
+ // Populate the counts for all BBs.
+ void populateCounters();
+
+ // Set the branch weights based on the count values.
+ void setBranchWeights();
+
+ // Annotate the value profile call sites for all value kinds.
+ void annotateValueSites();
+
+ // Annotate the value profile call sites for one value kind.
+ void annotateValueSites(uint32_t Kind);
+
+ // The hotness of the function from the profile count.
+ enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
+
+ // Return the function hotness from the profile.
+ FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+
+ // Return the function hash.
+ uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
+ // Return the profile record for this function.
+ InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+
+ // Return the auxiliary BB information.
+ UseBBInfo &getBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.getBBInfo(BB);
+ }
+
+ // Return the auxiliary BB information if available.
+ UseBBInfo *findBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.findBBInfo(BB);
+ }
+
+ Function &getFunc() const { return F; }
+
+private:
+ Function &F;
+ Module *M;
+ // This member stores the per-function information shared with the
+ // instrumentation (gen) pass.
+ FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
+
+ // The maximum count value in the profile. This is only used in PGO use
+ // compilation.
+ uint64_t ProgramMaxCount;
+
+ // Position of counter that remains to be read.
+ uint32_t CountPosition;
+
+ // Total size of the profile count for this function.
+ uint32_t ProfileCountSize;
+
+ // ProfileRecord for this function.
+ InstrProfRecord ProfileRecord;
+
+ // Function hotness info derived from profile.
+ FuncFreqAttr FreqAttr;
+
+ // Find the Instrumented BB and set the value.
+ void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
+
+ // Set the edge counter value for the unknown edge -- there should be only
+ // one unknown edge.
+ void setEdgeCount(DirectEdges &Edges, uint64_t Value);
+
+ // Return the FuncName string.
+ const std::string getFuncName() const { return FuncInfo.FuncName; }
+
+ // Set the hot/cold inline hints based on the count values.
+ // FIXME: This function should be removed once the functionality in
+ // the inliner is implemented.
+ void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+ if (ProgramMaxCount == 0)
+ return;
+ // Threshold of the hot functions.
+ const BranchProbability HotFunctionThreshold(1, 100);
+ // Threshold of the cold functions.
+ const BranchProbability ColdFunctionThreshold(2, 10000);
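+ // For example, with ProgramMaxCount == 1,000,000 a function is marked hot
+ // if its entry count is at least 10,000 (1%), and cold if even its hottest
+ // block executes at most 200 times (0.02%).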
+ if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount))
+ FreqAttr = FFA_Hot;
+ else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount))
+ FreqAttr = FFA_Cold;
+ }
+};
+
+// Visit all the edges and assign the count value for the instrumented
+// edges and the corresponding BBs.
+void PGOUseFunc::setInstrumentedCounts(
+ const std::vector<uint64_t> &CountFromProfile) {
+
+ assert(FuncInfo.getNumCounters() == CountFromProfile.size());
+ // Use a worklist as we will update the vector during the iteration.
+ std::vector<PGOUseEdge *> WorkList;
+ for (auto &E : FuncInfo.MST.AllEdges)
+ WorkList.push_back(E.get());
+
+ uint32_t I = 0;
+ for (auto &E : WorkList) {
+ BasicBlock *InstrBB = FuncInfo.getInstrBB(E);
+ if (!InstrBB)
+ continue;
+ uint64_t CountValue = CountFromProfile[I++];
+ if (!E->Removed) {
+ getBBInfo(InstrBB).setBBInfoCount(CountValue);
+ E->setEdgeCount(CountValue);
+ continue;
+ }
+
+ // Need to add two new edges.
+ BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+ BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+ // Add new edge of SrcBB->InstrBB.
+ PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0);
+ NewEdge.setEdgeCount(CountValue);
+ // Add new edge of InstrBB->DestBB.
+ PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0);
+ NewEdge1.setEdgeCount(CountValue);
+ NewEdge1.InMST = true;
+ getBBInfo(InstrBB).setBBInfoCount(CountValue);
+ }
+ ProfileCountSize = CountFromProfile.size();
+ CountPosition = I;
+}
+
+// Set the count value for the unknown edge. There should be one and only one
+// unknown edge in the Edges vector.
+void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
+ for (auto &E : Edges) {
+ if (E->CountValid)
+ continue;
+ E->setEdgeCount(Value);
+
+ getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+ getBBInfo(E->DestBB).UnknownCountInEdge--;
+ return;
+ }
+ llvm_unreachable("Cannot find the unknown count edge");
+}
+
+// Read the profile from ProfileFileName and assign the value to the
+// instrumented BB and the edges. This function also updates ProgramMaxCount.
+// Return true if the profile is successfully read, and false on errors.
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
+ auto &Ctx = M->getContext();
+ Expected<InstrProfRecord> Result =
+ PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+ if (Error E = Result.takeError()) {
+ handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+ auto Err = IPE.get();
+ bool SkipWarning = false;
+ if (Err == instrprof_error::unknown_function) {
+ NumOfPGOMissing++;
+ SkipWarning = !PGOWarnMissing;
+ } else if (Err == instrprof_error::hash_mismatch ||
+ Err == instrprof_error::malformed) {
+ NumOfPGOMismatch++;
+ SkipWarning =
+ NoPGOWarnMismatch ||
+ (NoPGOWarnMismatchComdat &&
+ (F.hasComdat() ||
+ F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
+ }
+
+ if (SkipWarning)
+ return;
+
+ std::string Msg = IPE.message() + std::string(" ") + F.getName().str();
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ });
+ return false;
+ }
+ ProfileRecord = std::move(Result.get());
+ std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
+
+ NumOfPGOFunc++;
+ DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+ uint64_t ValueSum = 0;
+ for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
+ DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+ ValueSum += CountFromProfile[I];
+ }
+
+ DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+
+ getBBInfo(nullptr).UnknownCountOutEdge = 2;
+ getBBInfo(nullptr).UnknownCountInEdge = 2;
+
+ setInstrumentedCounts(CountFromProfile);
+ ProgramMaxCount = PGOReader->getMaximumFunctionCount();
+ return true;
+}
+
+// Populate the counters from the instrumented BBs to all BBs.
+// At the end of this operation, every BB should have a valid count value.
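+// The propagation is a fixed-point iteration over two rules: a BB whose
+// in-edge (or out-edge) counts are all known takes their sum as its count,
+// and a BB with a known count and exactly one unknown out-edge (or in-edge)
+// determines that edge by subtraction. For example, a BB with count 100 and
+// out-edges counted {60, unknown} gives the unknown edge a count of 40.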
+void PGOUseFunc::populateCounters() {
+ // First set up Count variable for all BBs.
+ for (auto &E : FuncInfo.MST.AllEdges) {
+ if (E->Removed)
+ continue;
+
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ UseBBInfo &SrcInfo = getBBInfo(SrcBB);
+ UseBBInfo &DestInfo = getBBInfo(DestBB);
+ SrcInfo.OutEdges.push_back(E.get());
+ DestInfo.InEdges.push_back(E.get());
+ SrcInfo.UnknownCountOutEdge++;
+ DestInfo.UnknownCountInEdge++;
+
+ if (!E->CountValid)
+ continue;
+ DestInfo.UnknownCountInEdge--;
+ SrcInfo.UnknownCountOutEdge--;
+ }
+
+ bool Changes = true;
+ unsigned NumPasses = 0;
+ while (Changes) {
+ NumPasses++;
+ Changes = false;
+
+ // For efficient traversal, it's better to start from the end as most
+ // of the instrumented edges are at the end.
+ for (auto &BB : reverse(F)) {
+ UseBBInfo *Count = findBBInfo(&BB);
+ if (Count == nullptr)
+ continue;
+ if (!Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->OutEdges);
+ Count->CountValid = true;
+ Changes = true;
+ } else if (Count->UnknownCountInEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->InEdges);
+ Count->CountValid = true;
+ Changes = true;
+ }
+ }
+ if (Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t OutSum = sumEdgeCount(Count->OutEdges);
+ // If one of the successor blocks can terminate early (no-return), we can
+ // end up with a situation where the out-edge sum count is larger, because
+ // the source BB's count is collected from a post-dominated block.
+ if (Count->CountValue > OutSum)
+ Total = Count->CountValue - OutSum;
+ setEdgeCount(Count->OutEdges, Total);
+ Changes = true;
+ }
+ if (Count->UnknownCountInEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t InSum = sumEdgeCount(Count->InEdges);
+ if (Count->CountValue > InSum)
+ Total = Count->CountValue - InSum;
+ setEdgeCount(Count->InEdges, Total);
+ Changes = true;
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+#ifndef NDEBUG
+ // Assert every BB has a valid counter.
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ assert(BI->CountValid && "BB count is not valid");
+ }
+#endif
+ uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+ F.setEntryCount(FuncEntryCount);
+ uint64_t FuncMaxCount = FuncEntryCount;
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
+ }
+ markFunctionAttributes(FuncEntryCount, FuncMaxCount);
+
+ // Now annotate select instructions
+ FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
+ assert(CountPosition == ProfileCountSize);
+
+ DEBUG(FuncInfo.dumpInfo("after reading profile."));
+}
+
+// Assign the scaled count values to the BBs with multiple out edges.
+void PGOUseFunc::setBranchWeights() {
+ // Generate MD_prof metadata for every branch instruction.
+ DEBUG(dbgs() << "\nSetting branch weights.\n");
+ for (auto &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() < 2)
+ continue;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ continue;
+ if (getBBInfo(&BB).CountValue == 0)
+ continue;
+
+ // We have a branch BB with a non-zero count.
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ unsigned Size = BBCountInfo.OutEdges.size();
+ SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
+ uint64_t MaxCount = 0;
+ for (unsigned s = 0; s < Size; s++) {
+ const PGOUseEdge *E = BBCountInfo.OutEdges[s];
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ if (DestBB == nullptr)
+ continue;
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ uint64_t EdgeCount = E->CountValue;
+ if (EdgeCount > MaxCount)
+ MaxCount = EdgeCount;
+ EdgeCounts[SuccNum] = EdgeCount;
+ }
+ setProfMetadata(M, TI, EdgeCounts, MaxCount);
+ }
+}
+
+void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&SI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
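+ // Step is 1 when the select condition is true and 0 otherwise, so this
+ // counter accumulates the number of times the TrueValue is selected.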
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
+ {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
+ ++(*CurCtrIdx);
+}
+
+void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
+ std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
+ assert(*CurCtrIdx < CountFromProfile.size() &&
+ "Out of bound access of counters");
+ uint64_t SCounts[2];
+ SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
+ ++(*CurCtrIdx);
+ uint64_t TotalCount = 0;
+ auto BI = UseFunc->findBBInfo(SI.getParent());
+ if (BI != nullptr)
+ TotalCount = BI->CountValue;
+ // False Count
+ SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
+ uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
+ if (MaxCount)
+ setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
+}
+
+void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
+ if (!PGOInstrSelect)
+ return;
+ // FIXME: do not handle this yet.
+ if (SI.getCondition()->getType()->isVectorTy())
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NSIs++;
+ return;
+ case VM_instrument:
+ instrumentOneSelectInst(SI);
+ return;
+ case VM_annotate:
+ annotateOneSelectInst(SI);
+ return;
+ }
+
+ llvm_unreachable("Unknown visiting mode");
+}
+
+void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&MI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ Value *Length = MI.getLength();
+ assert(!dyn_cast<ConstantInt>(Length));
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty),
+ Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)});
+ ++CurCtrId;
+}
+
+void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {
+ if (!PGOInstrMemOP)
+ return;
+ Value *Length = MI.getLength();
+ // Do not instrument constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NMemIs++;
+ return;
+ case VM_instrument:
+ instrumentOneMemIntrinsic(MI);
+ return;
+ case VM_annotate:
+ Candidates.push_back(&MI);
+ return;
+ }
+ llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
+ if (DisableValueProfiling)
+ return;
+
+ // Create the PGOFuncName meta data.
+ createPGOFuncNameMetadata(F, FuncInfo.FuncName);
+
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+ unsigned ValueSiteIndex = 0;
+ auto &ValueSites = FuncInfo.ValueSites[Kind];
+ unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+ if (NumValueSites != ValueSites.size()) {
+ auto &Ctx = M->getContext();
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of value sites for kind = ") + Twine(Kind) +
+ " in " + F.getName().str(),
+ DS_Warning));
+ return;
+ }
+
+ for (auto &I : ValueSites) {
+ DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
+ annotateValueSite(*M, *I, ProfileRecord,
+ static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+ Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+ : MaxNumAnnotations);
+ ValueSiteIndex++;
+ }
+}
+} // end anonymous namespace
+
+// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime
+// aware this is an ir_level profile so it can set the version flag.
+static void createIRLevelProfileFlagVariable(Module &M) {
+ Type *IntTy64 = Type::getInt64Ty(M.getContext());
+ uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF);
+ auto IRLevelVersionVariable = new GlobalVariable(
+ M, IntTy64, true, GlobalVariable::ExternalLinkage,
+ Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)),
+ INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
+ IRLevelVersionVariable->setVisibility(GlobalValue::DefaultVisibility);
+ Triple TT(M.getTargetTriple());
+ if (!TT.supportsCOMDAT())
+ IRLevelVersionVariable->setLinkage(GlobalValue::WeakAnyLinkage);
+ else
+ IRLevelVersionVariable->setComdat(M.getOrInsertComdat(
+ StringRef(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR))));
+}
+
+// Collect the set of members for each Comdat in module M and store
+// in ComdatMembers.
+static void collectComdatMembers(
+ Module &M,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming)
+ return;
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+}
+
+static bool InstrumentAllFunctions(
+ Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
+ createIRLevelProfileFlagVariable(M);
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers);
+ }
+ return true;
+}
+
+bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ return InstrumentAllFunctions(M, LookupBPI, LookupBFI);
+}
+
+PreservedAnalyses PGOInstrumentationGen::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool annotateAllFunctions(
+ Module &M, StringRef ProfileFileName,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
+ DEBUG(dbgs() << "Read in profile counters: ");
+ auto &Ctx = M.getContext();
+ // Read the counter array from file.
+ auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
+ if (Error E = ReaderOrErr.takeError()) {
+ handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
+ });
+ return false;
+ }
+
+ std::unique_ptr<IndexedInstrProfReader> PGOReader =
+ std::move(ReaderOrErr.get());
+ if (!PGOReader) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
+ StringRef("Cannot get PGOReader")));
+ return false;
+ }
+ // TODO: might need to change the warning once the clang option is finalized.
+ if (!PGOReader->isIRLevelProfile()) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ ProfileFileName.data(), "Not an IR level instrumentation profile"));
+ return false;
+ }
+
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+ std::vector<Function *> HotFunctions;
+ std::vector<Function *> ColdFunctions;
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI);
+ if (!Func.readCounters(PGOReader.get()))
+ continue;
+ Func.populateCounters();
+ Func.setBranchWeights();
+ Func.annotateValueSites();
+ PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
+ if (FreqAttr == PGOUseFunc::FFA_Cold)
+ ColdFunctions.push_back(&F);
+ else if (FreqAttr == PGOUseFunc::FFA_Hot)
+ HotFunctions.push_back(&F);
+ if (PGOViewCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ LoopInfo LI{DominatorTree(F)};
+ std::unique_ptr<BranchProbabilityInfo> NewBPI =
+ llvm::make_unique<BranchProbabilityInfo>(F, LI);
+ std::unique_ptr<BlockFrequencyInfo> NewBFI =
+ llvm::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+
+ NewBFI->view();
+ }
+ if (PGOViewRawCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ if (ViewBlockFreqFuncName.empty())
+ WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else
+ ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ }
+ }
+ M.setProfileSummary(PGOReader->getSummary().getMD(M.getContext()));
+ // Set function hotness attribute from the profile.
+ // We have to apply these attributes at the end because their presence
+ // can affect the BranchProbabilityInfo of any callers, resulting in an
+ // inconsistent MST between prof-gen and prof-use.
+ for (auto &F : HotFunctions) {
+ F->addFnAttr(llvm::Attribute::InlineHint);
+ DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
+ }
+ for (auto &F : ColdFunctions) {
+ F->addFnAttr(llvm::Attribute::Cold);
+ DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() << "\n");
+ }
+ return true;
+}
+
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename)
+ : ProfileFileName(std::move(Filename)) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+}
+
+PreservedAnalyses PGOInstrumentationUse::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
+}
+
+namespace llvm {
+void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
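+ // For example, edge counts {600, 400} give MaxCount == 600; assuming
+ // calculateCountScale returns a scale of 1 when the maximum fits in 32
+ // bits, the emitted branch weights are simply 600 and 400.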
+
+ DEBUG(dbgs() << "Weight is: ";
+ for (const auto &W : Weights) { dbgs() << W << " "; }
+ dbgs() << "\n";);
+ TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ if (EmitBranchProbability) {
+ std::string BrCondStr = getBranchCondString(TI);
+ if (BrCondStr.empty())
+ return;
+
+ unsigned WSum =
+ std::accumulate(Weights.begin(), Weights.end(), 0,
+ [](unsigned w1, unsigned w2) { return w1 + w2; });
+ uint64_t TotalCount =
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), 0,
+ [](uint64_t c1, uint64_t c2) { return c1 + c2; });
+ BranchProbability BP(Weights[0], WSum);
+ std::string BranchProbStr;
+ raw_string_ostream OS(BranchProbStr);
+ OS << BP;
+ OS << " (total count : " << TotalCount << ")";
+ OS.flush();
+ Function *F = TI->getParent()->getParent();
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), "pgo-use-annot", *F, TI->getDebugLoc(),
+ Twine(BrCondStr) +
+ " is true with probability : " + Twine(BranchProbStr));
+ }
+}
+
+template <> struct GraphTraits<PGOUseFunc *> {
+ typedef const BasicBlock *NodeRef;
+ typedef succ_const_iterator ChildIteratorType;
+ typedef pointer_iterator<Function::const_iterator> nodes_iterator;
+
+ static NodeRef getEntryNode(const PGOUseFunc *G) {
+ return &G->getFunc().front();
+ }
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return succ_begin(N);
+ }
+ static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+ static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().begin());
+ }
+ static nodes_iterator nodes_end(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().end());
+ }
+};
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+ if (!Node->getName().empty())
+ return Node->getName();
+
+ std::string SimpleNodeName;
+ raw_string_ostream OS(SimpleNodeName);
+ Node->printAsOperand(OS, false);
+ return OS.str();
+}
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const PGOUseFunc *G) {
+ return G->getFunc().getName();
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << getSimpleNodeName(Node) << ":\\l";
+ UseBBInfo *BI = Graph->findBBInfo(Node);
+ OS << "Count : ";
+ if (BI && BI->CountValid)
+ OS << BI->CountValue << "\\l";
+ else
+ OS << "Unknown\\l";
+
+ if (!PGOInstrSelect)
+ return Result;
+
+ for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+ auto *I = &*BI;
+ if (!isa<SelectInst>(I))
+ continue;
+ // Display scaled counts for SELECT instruction:
+ OS << "SELECT : { T = ";
+ uint64_t TC, FC;
+ bool HasProf = I->extractProfMetadata(TC, FC);
+ if (!HasProf)
+ OS << "Unknown, F = Unknown }\\l";
+ else
+ OS << TC << ", F = " << FC << " }\\l";
+ }
+ return Result;
+ }
+};
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
new file mode 100644
index 000000000000..0bc9ddfbe4d3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -0,0 +1,419 @@
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
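+// For example (illustrative only): if the size profile of a memcpy call is
+// dominated by 8-byte copies, the call is versioned behind a size switch whose
+// 8-byte case invokes memcpy with a constant length of 8 (which later passes
+// can lower inline), while all other sizes fall through to the original call.
+//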
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes only.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+                                     cl::Hidden,
+                                     cl::desc("Disable memop optimization"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The maximum number of versions for the "
+                             "optimized memory intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             "block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+ : Func(Func), BFI(BFI), Changed(false) {
+ ValueDataArray =
+ llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from the MemOPSizeRange option.
+ getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+ PreciseRangeLast);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MI : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MI)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+    // Don't perform the optimization on constant-length calls.
+    if (isa<ConstantInt>(Length))
+ return;
+ WorkList.push_back(&MI);
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ bool Changed;
+ std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+ int64_t PreciseRangeLast;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemIntrinsic *MI);
+
+ // This kind shows which group the value falls in. For PreciseValue, we have
+ // the profile count for that value. LargeGroup groups the values that are in
+  // range [LargeValue, +inf). NonLargeGroup groups the rest of the values.
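+  // For illustration, assuming a precise range ending at 8 and a large-size
+  // group value of 8192 (both taken from the corresponding options): a
+  // profiled value of 3 is a PreciseValue, 9 (PreciseRangeLast + 1) denotes
+  // the NonLargeGroup bucket, and 8192 denotes the LargeGroup bucket.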
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+ MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+ if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+ return LargeGroup;
+ if (Value == PreciseRangeLast + 1)
+ return NonLargeGroup;
+ return PreciseValue;
+ }
+};
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
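+// A worked example with the default thresholds (count >= 1000 and
+// count >= 40% of TotalCount): a size value seen 1200 times out of a total of
+// 4000 passes the count threshold but fails the percentage test (1200 < 1600),
+// whereas 2000 out of 4000 is considered profitable.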
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
+
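+// Scale a value-profile count into the basic-block count domain:
+// ScaleCount = (Count * Num) / Denom, using a saturating multiply. For
+// example, a per-size count of 30 with a block count of 200 and a
+// value-profile total of 100 scales to 30 * 200 / 100 = 60.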
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
+
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+ assert(MI);
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
+ << "\n");
+  DEBUG(for (auto &VD : VDs) {
+    dbgs() << " (" << VD.Value << "," << VD.Count << ")\n";
+  });
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+ // Skip if the total value profiled count is 0, in which case we can't
+ // scale up the counts properly (and there is no profitable transformation).
+ if (TotalCount == 0)
+ return false;
+
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+  // Keep track of the count of the default case:
+ uint64_t RemainCount = TotalCount;
+ uint64_t SavedRemainCount = SavedTotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+    // Only precise values are of interest here.
+ if (getMemOPSizeKind(V) != PreciseValue)
+ continue;
+
+    // ValueCounts are sorted by count. Break at the first un-profitable
+    // value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+ assert(SavedRemainCount >= VD.Count);
+ SavedRemainCount -= VD.Count;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+
+ DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
+
+ BasicBlock *BB = MI->getParent();
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+ auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock::iterator It(*MI);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ MergeBB->setName("MemOP.Merge");
+ BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+ DefaultBB->setName("MemOP.Default");
+
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MI->getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+ // Clear the value profile data.
+ MI->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ if (SavedRemainCount > 0 || Version != NumVals)
+    // Otherwise, re-annotate it with the remaining un-promoted records.
+ annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+ SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+ DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ for (uint64_t SizeId : SizeIds) {
+ ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ Instruction *NewInst = MI->clone();
+ // Fix the argument.
+    cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewInst);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ DEBUG(dbgs() << *BB << "\n");
+ DEBUG(dbgs() << *DefaultBB << "\n");
+ DEBUG(dbgs() << *MergeBB << "\n");
+
+ emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
+ MI->getDebugLoc(),
+ Twine("optimize ") + getMIName(MI) + " with count " +
+ Twine(SumForOpt) + " out of " + Twine(TotalCount) +
+ " for " + Twine(Version) + " versions");
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ return PGOMemOPSizeOptImpl(F, BFI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
new file mode 100644
index 000000000000..e3c36c98ab0d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -0,0 +1,666 @@
+//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sancov"
+
+static const char *const SanCovTracePCIndirName =
+ "__sanitizer_cov_trace_pc_indir";
+static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc";
+static const char *const SanCovTraceCmp1 = "__sanitizer_cov_trace_cmp1";
+static const char *const SanCovTraceCmp2 = "__sanitizer_cov_trace_cmp2";
+static const char *const SanCovTraceCmp4 = "__sanitizer_cov_trace_cmp4";
+static const char *const SanCovTraceCmp8 = "__sanitizer_cov_trace_cmp8";
+static const char *const SanCovTraceDiv4 = "__sanitizer_cov_trace_div4";
+static const char *const SanCovTraceDiv8 = "__sanitizer_cov_trace_div8";
+static const char *const SanCovTraceGep = "__sanitizer_cov_trace_gep";
+static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
+static const char *const SanCovModuleCtorName = "sancov.module_ctor";
+static const uint64_t SanCtorAndDtorPriority = 2;
+
+static const char *const SanCovTracePCGuardName =
+ "__sanitizer_cov_trace_pc_guard";
+static const char *const SanCovTracePCGuardInitName =
+ "__sanitizer_cov_trace_pc_guard_init";
+static const char *const SanCov8bitCountersInitName =
+ "__sanitizer_cov_8bit_counters_init";
+
+static const char *const SanCovGuardsSectionName = "sancov_guards";
+static const char *const SanCovCountersSectionName = "sancov_cntrs";
+
+static cl::opt<int> ClCoverageLevel(
+ "sanitizer-coverage-level",
+ cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+ "3: all blocks and critical edges"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
+ cl::desc("pc tracing with a guard"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+    ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
+                         cl::desc("increments 8-bit counter for every edge"),
+                         cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClCMPTracing("sanitizer-coverage-trace-compares",
+ cl::desc("Tracing of CMP and similar instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
+ cl::desc("Tracing of DIV instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
+ cl::desc("Tracing of GEP instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClPruneBlocks("sanitizer-coverage-prune-blocks",
+ cl::desc("Reduce the number of instrumented blocks"),
+ cl::Hidden, cl::init(true));
+
+namespace {
+
+SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
+ SanitizerCoverageOptions Res;
+ switch (LegacyCoverageLevel) {
+ case 0:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_None;
+ break;
+ case 1:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Function;
+ break;
+ case 2:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_BB;
+ break;
+ case 3:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ break;
+ case 4:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ Res.IndirectCalls = true;
+ break;
+ }
+ return Res;
+}
+
+SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
+ // Sets CoverageType and IndirectCalls.
+ SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
+ Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
+ Options.IndirectCalls |= CLOpts.IndirectCalls;
+ Options.TraceCmp |= ClCMPTracing;
+ Options.TraceDiv |= ClDIVTracing;
+ Options.TraceGep |= ClGEPTracing;
+ Options.TracePC |= ClTracePC;
+ Options.TracePCGuard |= ClTracePCGuard;
+ Options.Inline8bitCounters |= ClInline8bitCounters;
+ if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters)
+ Options.TracePCGuard = true; // TracePCGuard is default.
+ Options.NoPrune |= !ClPruneBlocks;
+ return Options;
+}
+
+class SanitizerCoverageModule : public ModulePass {
+public:
+ SanitizerCoverageModule(
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions())
+ : ModulePass(ID), Options(OverrideFromCL(Options)) {
+ initializeSanitizerCoverageModulePass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override;
+ bool runOnFunction(Function &F);
+ static char ID; // Pass identification, replacement for typeid
+ StringRef getPassName() const override { return "SanitizerCoverageModule"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ }
+
+private:
+ void InjectCoverageForIndirectCalls(Function &F,
+ ArrayRef<Instruction *> IndirCalls);
+ void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
+ void InjectTraceForDiv(Function &F,
+ ArrayRef<BinaryOperator *> DivTraceTargets);
+ void InjectTraceForGep(Function &F,
+ ArrayRef<GetElementPtrInst *> GepTraceTargets);
+ void InjectTraceForSwitch(Function &F,
+ ArrayRef<Instruction *> SwitchTraceTargets);
+ bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
+ Function &F, Type *Ty,
+ const char *Section);
+ void CreateFunctionLocalArrays(size_t NumGuards, Function &F);
+ void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx);
+ void CreateInitCallForSection(Module &M, const char *InitFunctionName,
+ Type *Ty, const std::string &Section);
+
+ void SetNoSanitizeMetadata(Instruction *I) {
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
+ }
+
+ std::string getSectionName(const std::string &Section) const;
+ std::string getSectionStart(const std::string &Section) const;
+ std::string getSectionEnd(const std::string &Section) const;
+ Function *SanCovTracePCIndir;
+ Function *SanCovTracePC, *SanCovTracePCGuard;
+ Function *SanCovTraceCmpFunction[4];
+ Function *SanCovTraceDivFunction[2];
+ Function *SanCovTraceGepFunction;
+ Function *SanCovTraceSwitchFunction;
+ InlineAsm *EmptyAsm;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
+ *Int8Ty, *Int8PtrTy;
+ Module *CurModule;
+ Triple TargetTriple;
+ LLVMContext *C;
+ const DataLayout *DL;
+
+ GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
+ GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
+
+ SanitizerCoverageOptions Options;
+};
+
+} // namespace
+
+void SanitizerCoverageModule::CreateInitCallForSection(
+ Module &M, const char *InitFunctionName, Type *Ty,
+ const std::string &Section) {
+ IRBuilder<> IRB(M.getContext());
+ Function *CtorFunc;
+ GlobalVariable *SecStart =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr,
+ getSectionStart(Section));
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ GlobalVariable *SecEnd =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, getSectionEnd(Section));
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty},
+ {IRB.CreatePointerCast(SecStart, Ty), IRB.CreatePointerCast(SecEnd, Ty)});
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+}
+
+bool SanitizerCoverageModule::runOnModule(Module &M) {
+ if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
+ return false;
+ C = &(M.getContext());
+ DL = &M.getDataLayout();
+ CurModule = &M;
+ TargetTriple = Triple(M.getTargetTriple());
+ FunctionGuardArray = nullptr;
+ Function8bitCounterArray = nullptr;
+ IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
+ IntptrPtrTy = PointerType::getUnqual(IntptrTy);
+ Type *VoidTy = Type::getVoidTy(*C);
+ IRBuilder<> IRB(*C);
+ Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+ Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+ Int64Ty = IRB.getInt64Ty();
+ Int32Ty = IRB.getInt32Ty();
+ Int8Ty = IRB.getInt8Ty();
+
+ SanCovTracePCIndir = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy));
+ SanCovTraceCmpFunction[0] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty()));
+ SanCovTraceCmpFunction[1] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTraceCmp2, VoidTy, IRB.getInt16Ty(),
+ IRB.getInt16Ty()));
+ SanCovTraceCmpFunction[2] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTraceCmp4, VoidTy, IRB.getInt32Ty(),
+ IRB.getInt32Ty()));
+ SanCovTraceCmpFunction[3] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty));
+
+ SanCovTraceDivFunction[0] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceDiv4, VoidTy, IRB.getInt32Ty()));
+ SanCovTraceDivFunction[1] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceDiv8, VoidTy, Int64Ty));
+ SanCovTraceGepFunction =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceGep, VoidTy, IntptrTy));
+ SanCovTraceSwitchFunction =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
+
+ // We insert an empty inline asm after cov callbacks to avoid callback merge.
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+ StringRef(""), StringRef(""),
+ /*hasSideEffects=*/true);
+
+ SanCovTracePC = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(SanCovTracePCName, VoidTy));
+ SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ SanCovTracePCGuardName, VoidTy, Int32PtrTy));
+
+ for (auto &F : M)
+ runOnFunction(F);
+
+ if (FunctionGuardArray)
+ CreateInitCallForSection(M, SanCovTracePCGuardInitName, Int32PtrTy,
+ SanCovGuardsSectionName);
+ if (Function8bitCounterArray)
+ CreateInitCallForSection(M, SanCov8bitCountersInitName, Int8PtrTy,
+ SanCovCountersSectionName);
+
+ return true;
+}
+
+// True if block has successors and it dominates all of them.
+static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
+ if (succ_begin(BB) == succ_end(BB))
+ return false;
+
+ for (const BasicBlock *SUCC : make_range(succ_begin(BB), succ_end(BB))) {
+ if (!DT->dominates(BB, SUCC))
+ return false;
+ }
+
+ return true;
+}
+
+// True if block has predecessors and it postdominates all of them.
+static bool isFullPostDominator(const BasicBlock *BB,
+ const PostDominatorTree *PDT) {
+ if (pred_begin(BB) == pred_end(BB))
+ return false;
+
+ for (const BasicBlock *PRED : make_range(pred_begin(BB), pred_end(BB))) {
+ if (!PDT->dominates(BB, PRED))
+ return false;
+ }
+
+ return true;
+}
+
+static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT,
+ const SanitizerCoverageOptions &Options) {
+  // Don't insert coverage for unreachable blocks: we will never invoke the
+  // coverage callbacks for them, so counting them in
+  // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
+ // percentage. Also, unreachable instructions frequently have no debug
+ // locations.
+ if (isa<UnreachableInst>(BB->getTerminator()))
+ return false;
+
+ // Don't insert coverage into blocks without a valid insertion point
+ // (catchswitch blocks).
+ if (BB->getFirstInsertionPt() == BB->end())
+ return false;
+
+ if (Options.NoPrune || &F.getEntryBlock() == BB)
+ return true;
+
+ // Do not instrument full dominators, or full post-dominators with multiple
+ // predecessors.
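+  // For example, in an internal if/else diamond the branch block (a full
+  // dominator of its two arms) and the join block (a full post-dominator with
+  // two predecessors) are both skipped; instrumenting only the two arms is
+  // enough to reconstruct coverage for the whole diamond.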
+ return !isFullDominator(BB, DT)
+ && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
+}
+
+bool SanitizerCoverageModule::runOnFunction(Function &F) {
+ if (F.empty())
+ return false;
+ if (F.getName().find(".module_ctor") != std::string::npos)
+ return false; // Should not instrument sanitizer init functions.
+ if (F.getName().startswith("__sanitizer_"))
+ return false; // Don't instrument __sanitizer_* callbacks.
+ // Don't instrument MSVC CRT configuration helpers. They may run before normal
+ // initialization.
+ if (F.getName() == "__local_stdio_printf_options" ||
+ F.getName() == "__local_stdio_scanf_options")
+ return false;
+ // Don't instrument functions using SEH for now. Splitting basic blocks like
+ // we do for coverage breaks WinEHPrepare.
+ // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
+ if (F.hasPersonalityFn() &&
+ isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ return false;
+ if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
+ SplitAllCriticalEdges(F);
+ SmallVector<Instruction *, 8> IndirCalls;
+ SmallVector<BasicBlock *, 16> BlocksToInstrument;
+ SmallVector<Instruction *, 8> CmpTraceTargets;
+ SmallVector<Instruction *, 8> SwitchTraceTargets;
+ SmallVector<BinaryOperator *, 8> DivTraceTargets;
+ SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ const PostDominatorTree *PDT =
+ &getAnalysis<PostDominatorTreeWrapperPass>(F).getPostDomTree();
+
+ for (auto &BB : F) {
+ if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
+ BlocksToInstrument.push_back(&BB);
+ for (auto &Inst : BB) {
+ if (Options.IndirectCalls) {
+ CallSite CS(&Inst);
+ if (CS && !CS.getCalledFunction())
+ IndirCalls.push_back(&Inst);
+ }
+ if (Options.TraceCmp) {
+ if (isa<ICmpInst>(&Inst))
+ CmpTraceTargets.push_back(&Inst);
+ if (isa<SwitchInst>(&Inst))
+ SwitchTraceTargets.push_back(&Inst);
+ }
+ if (Options.TraceDiv)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
+ if (BO->getOpcode() == Instruction::SDiv ||
+ BO->getOpcode() == Instruction::UDiv)
+ DivTraceTargets.push_back(BO);
+ if (Options.TraceGep)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
+ GepTraceTargets.push_back(GEP);
+ }
+ }
+
+ InjectCoverage(F, BlocksToInstrument);
+ InjectCoverageForIndirectCalls(F, IndirCalls);
+ InjectTraceForCmp(F, CmpTraceTargets);
+ InjectTraceForSwitch(F, SwitchTraceTargets);
+ InjectTraceForDiv(F, DivTraceTargets);
+ InjectTraceForGep(F, GepTraceTargets);
+ return true;
+}
+
+GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
+ size_t NumElements, Function &F, Type *Ty, const char *Section) {
+ ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
+ auto Array = new GlobalVariable(
+ *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayTy), "__sancov_gen_");
+ if (auto Comdat = F.getComdat())
+ Array->setComdat(Comdat);
+ Array->setSection(getSectionName(Section));
+ return Array;
+}
+void SanitizerCoverageModule::CreateFunctionLocalArrays(size_t NumGuards,
+ Function &F) {
+ if (Options.TracePCGuard)
+ FunctionGuardArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int32Ty, SanCovGuardsSectionName);
+ if (Options.Inline8bitCounters)
+ Function8bitCounterArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int8Ty, SanCovCountersSectionName);
+}
+
+bool SanitizerCoverageModule::InjectCoverage(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks) {
+ if (AllBlocks.empty()) return false;
+ switch (Options.CoverageType) {
+ case SanitizerCoverageOptions::SCK_None:
+ return false;
+ case SanitizerCoverageOptions::SCK_Function:
+ CreateFunctionLocalArrays(1, F);
+ InjectCoverageAtBlock(F, F.getEntryBlock(), 0);
+ return true;
+ default: {
+ CreateFunctionLocalArrays(AllBlocks.size(), F);
+ for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
+ InjectCoverageAtBlock(F, *AllBlocks[i], i);
+ return true;
+ }
+ }
+}
+
+// On every indirect call we insert a call to the run-time function
+// __sanitizer_cov_trace_pc_indir with a single parameter: the callee address.
+// The address of the caller is passed implicitly via the caller PC.
+void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
+ Function &F, ArrayRef<Instruction *> IndirCalls) {
+ if (IndirCalls.empty())
+ return;
+ assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters);
+ for (auto I : IndirCalls) {
+ IRBuilder<> IRB(I);
+ CallSite CS(I);
+ Value *Callee = CS.getCalledValue();
+ if (isa<InlineAsm>(Callee))
+ continue;
+ IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
+ }
+}
+
+// For every switch statement we insert a call:
+// __sanitizer_cov_trace_switch(CondValue,
+// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
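+//
+// For example (illustrative), a switch on a 32-bit condition with cases 1 and
+// 42 produces the array {2, 32, 1, 42}: the number of cases, the condition
+// bit width, and the case values zero-extended to 64 bits and sorted in
+// increasing order.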
+
+void SanitizerCoverageModule::InjectTraceForSwitch(
+ Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
+ for (auto I : SwitchTraceTargets) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ IRBuilder<> IRB(I);
+ SmallVector<Constant *, 16> Initializers;
+ Value *Cond = SI->getCondition();
+ if (Cond->getType()->getScalarSizeInBits() >
+ Int64Ty->getScalarSizeInBits())
+ continue;
+ Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases()));
+ Initializers.push_back(
+ ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits()));
+ if (Cond->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
+ for (auto It : SI->cases()) {
+ Constant *C = It.getCaseValue();
+ if (C->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
+ Initializers.push_back(C);
+ }
+ std::sort(Initializers.begin() + 2, Initializers.end(),
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
+ ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
+ GlobalVariable *GV = new GlobalVariable(
+ *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfInt64Ty, Initializers),
+ "__sancov_gen_cov_switch_values");
+ IRB.CreateCall(SanCovTraceSwitchFunction,
+ {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)});
+ }
+ }
+}
+
+void SanitizerCoverageModule::InjectTraceForDiv(
+ Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
+ for (auto BO : DivTraceTargets) {
+ IRBuilder<> IRB(BO);
+ Value *A1 = BO->getOperand(1);
+ if (isa<ConstantInt>(A1)) continue;
+ if (!A1->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
+ int CallbackIdx = TypeSize == 32 ? 0 :
+ TypeSize == 64 ? 1 : -1;
+ if (CallbackIdx < 0) continue;
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
+ {IRB.CreateIntCast(A1, Ty, true)});
+ }
+}
+
+void SanitizerCoverageModule::InjectTraceForGep(
+ Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
+ for (auto GEP : GepTraceTargets) {
+ IRBuilder<> IRB(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
+ IRB.CreateCall(SanCovTraceGepFunction,
+ {IRB.CreateIntCast(*I, IntptrTy, true)});
+ }
+}
+
+void SanitizerCoverageModule::InjectTraceForCmp(
+ Function &, ArrayRef<Instruction *> CmpTraceTargets) {
+ for (auto I : CmpTraceTargets) {
+ if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
+ IRBuilder<> IRB(ICMP);
+ Value *A0 = ICMP->getOperand(0);
+ Value *A1 = ICMP->getOperand(1);
+ if (!A0->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
+ int CallbackIdx = TypeSize == 8 ? 0 :
+ TypeSize == 16 ? 1 :
+ TypeSize == 32 ? 2 :
+ TypeSize == 64 ? 3 : -1;
+ if (CallbackIdx < 0) continue;
+      // Emit __sanitizer_cov_trace_cmp{1,2,4,8}(A0, A1) based on operand size.
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(
+ SanCovTraceCmpFunction[CallbackIdx],
+ {IRB.CreateIntCast(A0, Ty, true), IRB.CreateIntCast(A1, Ty, true)});
+ }
+ }
+}
+
+void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
+ size_t Idx) {
+ BasicBlock::iterator IP = BB.getFirstInsertionPt();
+ bool IsEntryBB = &BB == &F.getEntryBlock();
+ DebugLoc EntryLoc;
+ if (IsEntryBB) {
+ if (auto SP = F.getSubprogram())
+ EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP);
+ // Keep static allocas and llvm.localescape calls in the entry block. Even
+ // if we aren't splitting the block, it's nice for allocas to be before
+ // calls.
+ IP = PrepareToSplitEntryBlock(BB, IP);
+ } else {
+ EntryLoc = IP->getDebugLoc();
+ }
+
+ IRBuilder<> IRB(&*IP);
+ IRB.SetCurrentDebugLocation(EntryLoc);
+ if (Options.TracePC) {
+ IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
+ IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ }
+ if (Options.TracePCGuard) {
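+    // The guard pointer is &FunctionGuardArray[Idx]; it is computed with
+    // integer arithmetic (Idx * 4 bytes per 32-bit guard) and cast back to
+    // an i32* before being passed to __sanitizer_cov_trace_pc_guard.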
+ auto GuardPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
+ ConstantInt::get(IntptrTy, Idx * 4)),
+ Int32PtrTy);
+ IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
+ IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ }
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
+}
+
+std::string
+SanitizerCoverageModule::getSectionName(const std::string &Section) const {
+ if (TargetTriple.getObjectFormat() == Triple::COFF)
+ return ".SCOV$M";
+ if (TargetTriple.isOSBinFormatMachO())
+ return "__DATA,__" + Section;
+ return "__" + Section;
+}
+
+std::string
+SanitizerCoverageModule::getSectionStart(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
+}
+
+std::string
+SanitizerCoverageModule::getSectionEnd(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
+}
+
+char SanitizerCoverageModule::ID = 0;
+INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov",
+                      "SanitizerCoverage: TODO. "
+                      "ModulePass",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SanitizerCoverageModule, "sancov",
+                    "SanitizerCoverage: TODO. "
+                    "ModulePass",
+                    false, false)
+ModulePass *llvm::createSanitizerCoverageModulePass(
+ const SanitizerCoverageOptions &Options) {
+ return new SanitizerCoverageModule(Options);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
new file mode 100644
index 000000000000..ec6904486e10
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -0,0 +1,703 @@
+//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer, a race detector.
+//
+// The tool is under development; for details about previous versions see
+// http://code.google.com/p/data-race-test
+//
+// The instrumentation phase is quite simple:
+// - Insert calls to run-time library before every memory access.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Insert calls at function entry/exit.
+// The rest is handled by the run-time library.
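+//
+// For example (illustrative), a 4-byte load of a pointer p is preceded by a
+// call to __tsan_read4(p), a 4-byte store by __tsan_write4(p), and every
+// instrumented function is bracketed by __tsan_func_entry(return_addr) and
+// __tsan_func_exit().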
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tsan"
+
+static cl::opt<bool> ClInstrumentMemoryAccesses(
+ "tsan-instrument-memory-accesses", cl::init(true),
+ cl::desc("Instrument memory accesses"), cl::Hidden);
+static cl::opt<bool> ClInstrumentFuncEntryExit(
+ "tsan-instrument-func-entry-exit", cl::init(true),
+ cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool> ClHandleCxxExceptions(
+ "tsan-handle-cxx-exceptions", cl::init(true),
+ cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
+ cl::Hidden);
+static cl::opt<bool> ClInstrumentAtomics(
+ "tsan-instrument-atomics", cl::init(true),
+ cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+ "tsan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+ "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+ "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
+
+static const char *const kTsanModuleCtorName = "tsan.module_ctor";
+static const char *const kTsanInitName = "__tsan_init";
+
+namespace {
+
+/// ThreadSanitizer: instrument the code in module to find races.
+struct ThreadSanitizer : public FunctionPass {
+ ThreadSanitizer() : FunctionPass(ID) {}
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+ bool doInitialization(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid.
+
+ private:
+ void initializeCallbacks(Module &M);
+ bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL);
+ bool instrumentAtomic(Instruction *I, const DataLayout &DL);
+ bool instrumentMemIntrinsic(Instruction *I);
+ void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
+ SmallVectorImpl<Instruction *> &All,
+ const DataLayout &DL);
+ bool addrPointsToConstantData(Value *Addr);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ void InsertRuntimeIgnores(Function &F);
+
+ Type *IntptrTy;
+ IntegerType *OrdTy;
+  // Callbacks to the run-time library are set up in initializeCallbacks.
+ Function *TsanFuncEntry;
+ Function *TsanFuncExit;
+ Function *TsanIgnoreBegin;
+ Function *TsanIgnoreEnd;
+  // Access sizes are powers of two: 1, 2, 4, 8, 16.
+ static const size_t kNumberOfAccessSizes = 5;
+ Function *TsanRead[kNumberOfAccessSizes];
+ Function *TsanWrite[kNumberOfAccessSizes];
+ Function *TsanUnalignedRead[kNumberOfAccessSizes];
+ Function *TsanUnalignedWrite[kNumberOfAccessSizes];
+ Function *TsanAtomicLoad[kNumberOfAccessSizes];
+ Function *TsanAtomicStore[kNumberOfAccessSizes];
+ Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
+ Function *TsanAtomicCAS[kNumberOfAccessSizes];
+ Function *TsanAtomicThreadFence;
+ Function *TsanAtomicSignalFence;
+ Function *TsanVptrUpdate;
+ Function *TsanVptrLoad;
+ Function *MemmoveFn, *MemcpyFn, *MemsetFn;
+ Function *TsanCtorFunction;
+};
+} // namespace
+
+char ThreadSanitizer::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ ThreadSanitizer, "tsan",
+ "ThreadSanitizer: detects data races.",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ ThreadSanitizer, "tsan",
+ "ThreadSanitizer: detects data races.",
+ false, false)
+
+StringRef ThreadSanitizer::getPassName() const { return "ThreadSanitizer"; }
+
+void ThreadSanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+FunctionPass *llvm::createThreadSanitizerPass() {
+ return new ThreadSanitizer();
+}
+
+void ThreadSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ AttributeList Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ // Initialize the callbacks.
+ TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ TsanFuncExit = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy()));
+ TsanIgnoreBegin = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy()));
+ TsanIgnoreEnd = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_ignore_thread_end", Attr, IRB.getVoidTy()));
+ OrdTy = IRB.getInt32Ty();
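+  // Each access size gets its own family of callbacks, e.g. for 4-byte
+  // accesses: __tsan_read4, __tsan_write4, __tsan_unaligned_read4,
+  // __tsan_unaligned_write4, __tsan_atomic32_load, __tsan_atomic32_store,
+  // __tsan_atomic32_fetch_add, and __tsan_atomic32_compare_exchange_val.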
+ for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
+ const unsigned ByteSize = 1U << i;
+ const unsigned BitSize = ByteSize * 8;
+ std::string ByteSizeStr = utostr(ByteSize);
+ std::string BitSizeStr = utostr(BitSize);
+ SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
+ TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+
+ SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
+ TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+
+ SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
+ TsanUnalignedRead[i] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+
+ SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
+ TsanUnalignedWrite[i] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+
+ Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
+ TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy));
+
+ SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
+ TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy));
+
+ for (int op = AtomicRMWInst::FIRST_BINOP;
+ op <= AtomicRMWInst::LAST_BINOP; ++op) {
+ TsanAtomicRMW[op][i] = nullptr;
+ const char *NamePart = nullptr;
+ if (op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else if (op == AtomicRMWInst::Nand)
+ NamePart = "_fetch_nand";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy));
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy));
+ }
+ TsanVptrUpdate = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy()));
+ TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
+ TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy));
+ TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy));
+
+  MemmoveFn = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(),
+                            IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
+  MemcpyFn = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(),
+                            IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
+  MemsetFn = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(),
+                            IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));
+}
+
+bool ThreadSanitizer::doInitialization(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+ IntptrTy = DL.getIntPtrType(M.getContext());
+ std::tie(TsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{});
+
+ appendToGlobalCtors(M, TsanCtorFunction, 0);
+
+ return true;
+}
+
+static bool isVtableAccess(Instruction *I) {
+ if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+ return Tag->isTBAAVtableAccess();
+ return false;
+}
+
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation. The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
+ // Peel off GEPs and BitCasts.
+ Addr = Addr->stripInBoundsOffsets();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->hasSection()) {
+ StringRef SectionName = GV->getSection();
+ // Check if the global is in the PGO counters section.
+ auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+ return false;
+ }
+
+ // Check if the global is private gcov data.
+ if (GV->getName().startswith("__llvm_gcov") ||
+ GV->getName().startswith("__llvm_gcda"))
+ return false;
+ }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ if (Addr) {
+ Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return false;
+ }
+
+ return true;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+ // If this is a GEP, just analyze its pointer operand.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+ Addr = GEP->getPointerOperand();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->isConstant()) {
+ // Reads from constant globals can not race with any writes.
+ NumOmittedReadsFromConstantGlobals++;
+ return true;
+ }
+ } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+ if (isVtableAccess(L)) {
+ // Reads from a vtable pointer can not race with any writes.
+ NumOmittedReadsFromVtable++;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+// - read-before-write (within same BB, no calls between)
+// - not captured variables
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
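+//
+// For example, 'x++' is lowered to a load of x followed by a store to x in
+// the same basic block; walking the block backwards, the store records x as a
+// write target, so the earlier load is dropped as a read-before-write.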
+void ThreadSanitizer::chooseInstructionsToInstrument(
+ SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+ const DataLayout &DL) {
+ SmallSet<Value*, 8> WriteTargets;
+ // Iterate from the end.
+ for (Instruction *I : reverse(Local)) {
+ if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+ Value *Addr = Store->getPointerOperand();
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+ continue;
+ WriteTargets.insert(Addr);
+ } else {
+ LoadInst *Load = cast<LoadInst>(I);
+ Value *Addr = Load->getPointerOperand();
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+ continue;
+ if (WriteTargets.count(Addr)) {
+ // We will write to this temp, so no reason to analyze the read.
+ NumOmittedReadsBeforeWrite++;
+ continue;
+ }
+ if (addrPointsToConstantData(Addr)) {
+ // Addr points to some constant data -- it can not race with any writes.
+ continue;
+ }
+ }
+ Value *Addr = isa<StoreInst>(*I)
+ ? cast<StoreInst>(I)->getPointerOperand()
+ : cast<LoadInst>(I)->getPointerOperand();
+ if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+ !PointerMayBeCaptured(Addr, true, true)) {
+ // The variable is addressable but not captured, so it cannot be
+ // referenced from a different thread and participate in a data race
+ // (see llvm/Analysis/CaptureTracking.h for details).
+ NumOmittedNonCaptured++;
+ continue;
+ }
+ All.push_back(I);
+ }
+ Local.clear();
+}
+
+static bool isAtomic(Instruction *I) {
+ // TODO: Ask TTI whether synchronization scope is between threads.
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
+ if (isa<AtomicRMWInst>(I))
+ return true;
+ if (isa<AtomicCmpXchgInst>(I))
+ return true;
+ if (isa<FenceInst>(I))
+ return true;
+ return false;
+}
+
+void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRB.CreateCall(TsanIgnoreBegin);
+ EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanIgnoreEnd);
+ }
+}
+
+bool ThreadSanitizer::runOnFunction(Function &F) {
+ // This is required to prevent instrumenting call to __tsan_init from within
+ // the module constructor.
+ if (&F == TsanCtorFunction)
+ return false;
+ initializeCallbacks(*F.getParent());
+ SmallVector<Instruction*, 8> AllLoadsAndStores;
+ SmallVector<Instruction*, 8> LocalLoadsAndStores;
+ SmallVector<Instruction*, 8> AtomicAccesses;
+ SmallVector<Instruction*, 8> MemIntrinCalls;
+ bool Res = false;
+ bool HasCalls = false;
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ // Traverse all instructions, collect loads/stores/returns, check for calls.
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (isAtomic(&Inst))
+ AtomicAccesses.push_back(&Inst);
+ else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ LocalLoadsAndStores.push_back(&Inst);
+ else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
+ HasCalls = true;
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
+ DL);
+ }
+ }
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
+ }
+
+ // We have collected all loads and stores.
+ // FIXME: many of these accesses do not need to be checked for races
+ // (e.g. variables that do not escape, etc).
+
+ // Instrument memory accesses only if we want to report bugs in the function.
+ if (ClInstrumentMemoryAccesses && SanitizeFunction)
+ for (auto Inst : AllLoadsAndStores) {
+ Res |= instrumentLoadOrStore(Inst, DL);
+ }
+
+ // Instrument atomic memory accesses in any case (they can be used to
+ // implement synchronization).
+ if (ClInstrumentAtomics)
+ for (auto Inst : AtomicAccesses) {
+ Res |= instrumentAtomic(Inst, DL);
+ }
+
+ if (ClInstrumentMemIntrinsics && SanitizeFunction)
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(Inst);
+ }
+
+ if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
+ assert(!F.hasFnAttribute(Attribute::SanitizeThread));
+ if (HasCalls)
+ InsertRuntimeIgnores(F);
+ }
+
+ // Instrument function entry/exit points if there were instrumented accesses.
+ if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ Value *ReturnAddress = IRB.CreateCall(
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
+ IRB.getInt32(0));
+ IRB.CreateCall(TsanFuncEntry, ReturnAddress);
+
+ EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanFuncExit, {});
+ }
+ Res = true;
+ }
+ return Res;
+}
+
+bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
+ const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ bool IsWrite = isa<StoreInst>(*I);
+ Value *Addr = IsWrite
+ ? cast<StoreInst>(I)->getPointerOperand()
+ : cast<LoadInst>(I)->getPointerOperand();
+
+ // swifterror memory addresses are mem2reg promoted by instruction selection.
+ // As such they cannot have regular uses like an instrumentation function and
+ // it makes no sense to track them as memory.
+ if (Addr->isSwiftError())
+ return false;
+
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ if (IsWrite && isVtableAccess(I)) {
+ DEBUG(dbgs() << " VPTR : " << *I << "\n");
+ Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
+ // StoredValue may be a vector type if we are storing several vptrs at once.
+ // In this case, just take the first element of the vector since this is
+ // enough to find vptr races.
+ if (isa<VectorType>(StoredValue->getType()))
+ StoredValue = IRB.CreateExtractElement(
+ StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
+ if (StoredValue->getType()->isIntegerTy())
+ StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+ // Call TsanVptrUpdate.
+ IRB.CreateCall(TsanVptrUpdate,
+ {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
+ NumInstrumentedVtableWrites++;
+ return true;
+ }
+ if (!IsWrite && isVtableAccess(I)) {
+ IRB.CreateCall(TsanVptrLoad,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ NumInstrumentedVtableReads++;
+ return true;
+ }
+ const unsigned Alignment = IsWrite
+ ? cast<StoreInst>(I)->getAlignment()
+ : cast<LoadInst>(I)->getAlignment();
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ Value *OnAccessFunc = nullptr;
+ if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0)
+ OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+ IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ if (IsWrite) NumInstrumentedWrites++;
+ else NumInstrumentedReads++;
+ return true;
+}
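+// A sketch of the callback selection above (names assumed from the TSan
+// runtime; %p.i8 denotes the i8* cast of the address): an aligned
+//   store i32 %v, i32* %p, align 4
+// yields Idx == 2 and emits
+//   call void @__tsan_write4(i8* %p.i8)
+// whereas an i16 store with align 1 fails the alignment check and emits
+//   call void @__tsan_unaligned_write2(i8* %p.i8)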
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+ uint32_t v = 0;
+ switch (ord) {
+ case AtomicOrdering::NotAtomic:
+ llvm_unreachable("unexpected atomic ordering!");
+ case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
+ case AtomicOrdering::Monotonic: v = 0; break;
+ // Not specified yet:
+ // case AtomicOrdering::Consume: v = 1; break;
+ case AtomicOrdering::Acquire: v = 2; break;
+ case AtomicOrdering::Release: v = 3; break;
+ case AtomicOrdering::AcquireRelease: v = 4; break;
+ case AtomicOrdering::SequentiallyConsistent: v = 5; break;
+ }
+ return IRB->getInt32(v);
+}
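+// The encodings above are intended to match the __tsan_memory_order values
+// used by the TSan runtime (0 = relaxed, 1 = consume, 2 = acquire,
+// 3 = release, 4 = acq_rel, 5 = seq_cst).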
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
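+// Illustrative rewrite (a sketch; MemsetFn/MemcpyFn/MemmoveFn are bound to
+// the plain libc symbols elsewhere in this pass): a call such as
+//   call void @llvm.memset.*(i8* %p, i8 0, i64 %n, ...)
+// is replaced with
+//   call i8* @memset(i8* %p, i32 0, i64 %n)
+// which the runtime interceptor then observes.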
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+ IRBuilder<> IRB(I);
+ if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+ IRB.CreateCall(
+ MemsetFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+ IRB.CreateCall(
+ isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ }
+ return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards. For background see C++11 standard. A slightly older, publicly
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
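+//
+// Illustrative sketch of the rewrite performed below (runtime names assumed
+// from the TSan runtime): an acquire load such as
+//   %v = load atomic i32, i32* %p acquire, align 4
+// becomes
+//   %v = call i32 @__tsan_atomic32_load(i32* %p, i32 2)
+// where 2 is the createOrdering() encoding of `acquire`.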
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Value *Addr = LI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ createOrdering(&IRB, LI->getOrdering())};
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
+ Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
+ I->replaceAllUsesWith(Cast);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Addr = SI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
+ createOrdering(&IRB, SI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (!F)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *CmpOperand =
+ IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
+ Value *NewOperand =
+ IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ CmpOperand,
+ NewOperand,
+ createOrdering(&IRB, CASI->getSuccessOrdering()),
+ createOrdering(&IRB, CASI->getFailureOrdering())};
+ CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
+ Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
+ Value *OldVal = C;
+ Type *OrigOldValTy = CASI->getNewValOperand()->getType();
+ if (Ty != OrigOldValTy) {
+ // The value is a pointer, so we need to cast the return value.
+ OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
+ }
+
+ Value *Res =
+ IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
+ Res = IRB.CreateInsertValue(Res, Success, 1);
+
+ I->replaceAllUsesWith(Res);
+ I->eraseFromParent();
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ Function *F = FI->getSyncScopeID() == SyncScope::SingleThread ?
+ TsanAtomicSignalFence : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ }
+ return true;
+}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ if (TypeSize != 8 && TypeSize != 16 &&
+ TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+ NumAccessesWithBadSize++;
+ // Ignore all unusual sizes.
+ return -1;
+ }
+ size_t Idx = countTrailingZeros(TypeSize / 8);
+ assert(Idx < kNumberOfAccessSizes);
+ return Idx;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
new file mode 100644
index 000000000000..cb3b5757f8d0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -0,0 +1,179 @@
+//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization --*- C++ -*---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a class ARCRuntimeEntryPoints for use in
+/// creating/managing references to entry points to the arc objective c runtime.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+
+#include "ObjCARC.h"
+
+namespace llvm {
+namespace objcarc {
+
+enum class ARCRuntimeEntryPointKind {
+ AutoreleaseRV,
+ Release,
+ Retain,
+ RetainBlock,
+ Autorelease,
+ StoreStrong,
+ RetainRV,
+ RetainAutorelease,
+ RetainAutoreleaseRV,
+};
+
+/// Declarations for ObjC runtime functions and constants. These are initialized
+/// lazily to avoid cluttering up the Module with unused declarations.
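+///
+/// A typical use (an illustrative sketch; M, Arg and InsertPt are
+/// hypothetical):
+/// \code
+///   ARCRuntimeEntryPoints EP;
+///   EP.init(&M);
+///   Constant *RetainDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
+///   CallInst::Create(RetainDecl, Arg, "", InsertPt);
+/// \endcode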
+class ARCRuntimeEntryPoints {
+public:
+ ARCRuntimeEntryPoints() : TheModule(nullptr),
+ AutoreleaseRV(nullptr),
+ Release(nullptr),
+ Retain(nullptr),
+ RetainBlock(nullptr),
+ Autorelease(nullptr),
+ StoreStrong(nullptr),
+ RetainRV(nullptr),
+ RetainAutorelease(nullptr),
+ RetainAutoreleaseRV(nullptr) { }
+
+ void init(Module *M) {
+ TheModule = M;
+ AutoreleaseRV = nullptr;
+ Release = nullptr;
+ Retain = nullptr;
+ RetainBlock = nullptr;
+ Autorelease = nullptr;
+ StoreStrong = nullptr;
+ RetainRV = nullptr;
+ RetainAutorelease = nullptr;
+ RetainAutoreleaseRV = nullptr;
+ }
+
+ Constant *get(ARCRuntimeEntryPointKind kind) {
+ assert(TheModule != nullptr && "Not initialized.");
+
+ switch (kind) {
+ case ARCRuntimeEntryPointKind::AutoreleaseRV:
+ return getI8XRetI8XEntryPoint(AutoreleaseRV,
+ "objc_autoreleaseReturnValue", true);
+ case ARCRuntimeEntryPointKind::Release:
+ return getVoidRetI8XEntryPoint(Release, "objc_release");
+ case ARCRuntimeEntryPointKind::Retain:
+ return getI8XRetI8XEntryPoint(Retain, "objc_retain", true);
+ case ARCRuntimeEntryPointKind::RetainBlock:
+ return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false);
+ case ARCRuntimeEntryPointKind::Autorelease:
+ return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true);
+ case ARCRuntimeEntryPointKind::StoreStrong:
+ return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong");
+ case ARCRuntimeEntryPointKind::RetainRV:
+ return getI8XRetI8XEntryPoint(RetainRV,
+ "objc_retainAutoreleasedReturnValue", true);
+ case ARCRuntimeEntryPointKind::RetainAutorelease:
+ return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease",
+ true);
+ case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
+ return getI8XRetI8XEntryPoint(RetainAutoreleaseRV,
+ "objc_retainAutoreleaseReturnValue", true);
+ }
+
+ llvm_unreachable("Switch should be a covered switch.");
+ }
+
+private:
+ /// Cached reference to the module which we will insert declarations into.
+ Module *TheModule;
+
+ /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+ Constant *AutoreleaseRV;
+ /// Declaration for ObjC runtime function objc_release.
+ Constant *Release;
+ /// Declaration for ObjC runtime function objc_retain.
+ Constant *Retain;
+ /// Declaration for ObjC runtime function objc_retainBlock.
+ Constant *RetainBlock;
+ /// Declaration for ObjC runtime function objc_autorelease.
+ Constant *Autorelease;
+ /// Declaration for objc_storeStrong().
+ Constant *StoreStrong;
+ /// Declaration for objc_retainAutoreleasedReturnValue().
+ Constant *RetainRV;
+ /// Declaration for objc_retainAutorelease().
+ Constant *RetainAutorelease;
+ /// Declaration for objc_retainAutoreleaseReturnValue().
+ Constant *RetainAutoreleaseRV;
+
+ Constant *getVoidRetI8XEntryPoint(Constant *&Decl, StringRef Name) {
+ if (Decl)
+ return Decl;
+
+ LLVMContext &C = TheModule->getContext();
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
+ FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
+ /*isVarArg=*/false);
+ return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+ }
+
+ Constant *getI8XRetI8XEntryPoint(Constant *&Decl, StringRef Name,
+ bool NoUnwind = false) {
+ if (Decl)
+ return Decl;
+
+ LLVMContext &C = TheModule->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *Params[] = { I8X };
+ FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttributeList Attr = AttributeList();
+
+ if (NoUnwind)
+ Attr = Attr.addAttribute(C, AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+
+ return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+ }
+
+ Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl, StringRef Name) {
+ if (Decl)
+ return Decl;
+
+ LLVMContext &C = TheModule->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+ Type *Params[] = { I8XX, I8X };
+
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
+ Attr = Attr.addParamAttribute(C, 0, Attribute::NoCapture);
+
+ FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
+ /*isVarArg=*/false);
+
+ return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+ }
+
+}; // class ARCRuntimeEntryPoints
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
new file mode 100644
index 000000000000..9c5cf6f5f5ab
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -0,0 +1,108 @@
+//===- BlotMapVector.h - A MapVector with the blot operation -*- C++ -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+/// \brief An associative container with fast insertion-order (deterministic)
+/// iteration over its elements, plus the special blot operation.
+template <class KeyT, class ValueT> class BlotMapVector {
+ /// Map keys to indices in Vector.
+ typedef DenseMap<KeyT, size_t> MapTy;
+ MapTy Map;
+
+ typedef std::vector<std::pair<KeyT, ValueT>> VectorTy;
+ /// Keys and values.
+ VectorTy Vector;
+
+public:
+ typedef typename VectorTy::iterator iterator;
+ typedef typename VectorTy::const_iterator const_iterator;
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+#ifdef EXPENSIVE_CHECKS
+ ~BlotMapVector() {
+ assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
+ ++I) {
+ assert(I->second < Vector.size());
+ assert(Vector[I->second].first == I->first);
+ }
+ for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
+ I != E; ++I)
+ assert(!I->first || (Map.count(I->first) &&
+ Map[I->first] == size_t(I - Vector.begin())));
+ }
+#endif
+
+ ValueT &operator[](const KeyT &Arg) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(Arg, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(std::make_pair(Arg, ValueT()));
+ return Vector[Num].second;
+ }
+ return Vector[Pair.first->second].second;
+ }
+
+ std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(InsertPair);
+ return std::make_pair(Vector.begin() + Num, true);
+ }
+ return std::make_pair(Vector.begin() + Pair.first->second, false);
+ }
+
+ iterator find(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ typename MapTy::const_iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ /// This is similar to erase, but instead of removing the element from the
+ /// vector, it just zeros out the key in the vector. This leaves iterators
+ /// intact, but clients must be prepared for zeroed-out keys when iterating.
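+ /// For example (illustrative): after
+ ///   BMV[K] = V; BMV.blot(K);
+ /// find(K) returns end(), but iterating the container still visits the
+ /// blotted slot, now carrying a default-constructed key.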
+ void blot(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return;
+ Vector[It->second].first = KeyT();
+ Map.erase(It);
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+
+ bool empty() const {
+ assert(Map.empty() == Vector.empty());
+ return Map.empty();
+ }
+};
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
new file mode 100644
index 000000000000..464805051c65
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -0,0 +1,278 @@
+//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-dependency"
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ switch (Class) {
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // These operations never directly modify a reference count.
+ return false;
+ default: break;
+ }
+
+ ImmutableCallSite CS(Inst);
+ assert(CS && "Only calls can alter reference counts!");
+
+ // See if AliasAnalysis can help us with the call.
+ FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return false;
+ if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ const DataLayout &DL = Inst->getModule()->getDataLayout();
+ for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ I != E; ++I) {
+ const Value *Op = *I;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Ptr, Op, DL))
+ return true;
+ }
+ return false;
+ }
+
+ // Assume the worst.
+ return true;
+}
+
+bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // First perform a quick check: if this class of instruction can never decrement reference counts, we are done.
+ if (!CanDecrementRefCount(Class))
+ return false;
+
+ // Otherwise, just use CanAlterRefCount for now.
+ return CanAlterRefCount(Inst, Ptr, PA, Class);
+}
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class) {
+ // ARCInstKind::Call operations (as opposed to
+ // ARCInstKind::CallOrUser) never "use" objc pointers.
+ if (Class == ARCInstKind::Call)
+ return false;
+
+ const DataLayout &DL = Inst->getModule()->getDataLayout();
+
+ // Consider various instructions which may have pointer arguments which are
+ // not "uses".
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+ // Comparing a pointer with null, or any other constant, isn't really a use,
+ // because we don't care what the pointer points to, or about the values
+ // of any other dynamic reference-counted pointers.
+ if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
+ return false;
+ } else if (auto CS = ImmutableCallSite(Inst)) {
+ // For calls, just check the arguments (and not the callee operand).
+ for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
+ OE = CS.arg_end(); OI != OE; ++OI) {
+ const Value *Op = *OI;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Ptr, Op, DL))
+ return true;
+ }
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // Special-case stores, because we don't care about the stored value, just
+ // the store address.
+ const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand(), DL);
+ // If we can't tell what the underlying object was, assume there is a
+ // dependence.
+ return IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Op, Ptr, DL);
+ }
+
+ // Check each operand for a match.
+ for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+ OI != OE; ++OI) {
+ const Value *Op = *OI;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op, DL))
+ return true;
+ }
+ return false;
+}
+
+/// Test if there can be dependencies on Inst through Arg. This function only
+/// tests dependencies relevant for removing pairs of calls.
+bool
+llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
+ const Value *Arg, ProvenanceAnalysis &PA) {
+ // If we've reached the definition of Arg, stop.
+ if (Inst == Arg)
+ return true;
+
+ switch (Flavor) {
+ case NeedsPositiveRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanUse(Inst, Arg, PA, Class);
+ }
+ }
+
+ case AutoreleasePoolBoundary: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // These mark the beginning and end of an autorelease pool scope.
+ return true;
+ default:
+ // Nothing else does this.
+ return false;
+ }
+ }
+
+ case CanChangeRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively assume this can decrement any count.
+ return true;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanAlterRefCount(Inst, Arg, PA, Class);
+ }
+ }
+
+ case RetainAutoreleaseDep:
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // Don't merge an objc_autorelease with an objc_retain inside a different
+ // autoreleasepool scope.
+ return true;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Nothing else matters for objc_retainAutorelease formation.
+ return false;
+ }
+
+ case RetainAutoreleaseRVDep: {
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Anything that can autorelease interrupts
+ // retainAutoreleaseReturnValue formation.
+ return CanInterruptRV(Class);
+ }
+ }
+
+ case RetainRVDep:
+ return CanInterruptRV(GetBasicARCInstKind(Inst));
+ }
+
+ llvm_unreachable("Invalid dependence flavor");
+}
+
+/// Walk up the CFG from StartPos (which is in StartBB) and find local and
+/// non-local dependencies on Arg.
+///
+/// TODO: Cache results?
+void
+llvm::objcarc::FindDependencies(DependenceKind Flavor,
+ const Value *Arg,
+ BasicBlock *StartBB, Instruction *StartInst,
+ SmallPtrSetImpl<Instruction *> &DependingInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ ProvenanceAnalysis &PA) {
+ BasicBlock::iterator StartPos = StartInst->getIterator();
+
+ SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+ Worklist.push_back(std::make_pair(StartBB, StartPos));
+ do {
+ std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+ Worklist.pop_back_val();
+ BasicBlock *LocalStartBB = Pair.first;
+ BasicBlock::iterator LocalStartPos = Pair.second;
+ BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+ for (;;) {
+ if (LocalStartPos == StartBBBegin) {
+ pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+ if (PI == PE)
+ // If we've reached the function entry, produce a null dependence.
+ DependingInsts.insert(nullptr);
+ else
+ // Add the predecessors to the worklist.
+ do {
+ BasicBlock *PredBB = *PI;
+ if (Visited.insert(PredBB).second)
+ Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
+ } while (++PI != PE);
+ break;
+ }
+
+ Instruction *Inst = &*--LocalStartPos;
+ if (Depends(Flavor, Inst, Arg, PA)) {
+ DependingInsts.insert(Inst);
+ break;
+ }
+ }
+ } while (!Worklist.empty());
+
+ // Determine whether the original StartBB post-dominates all of the blocks we
+ // visited. If not, insert a sentinel indicating that most optimizations are
+ // not safe.
+ for (const BasicBlock *BB : Visited) {
+ if (BB == StartBB)
+ continue;
+ const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+ for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+ const BasicBlock *Succ = *SI;
+ if (Succ != StartBB && !Visited.count(Succ)) {
+ DependingInsts.insert(reinterpret_cast<Instruction *>(-1));
+ return;
+ }
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
new file mode 100644
index 000000000000..8cc1232b18ca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -0,0 +1,89 @@
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+
+namespace llvm {
+ class BasicBlock;
+ class Instruction;
+ class Value;
+}
+
+namespace llvm {
+namespace objcarc {
+
+class ProvenanceAnalysis;
+
+/// \enum DependenceKind
+/// \brief Defines different dependence kinds among various ARC constructs.
+///
+/// There are several kinds of dependence-like concepts in use here.
+///
+enum DependenceKind {
+ NeedsPositiveRetainCount, ///< Anything that "uses" the pointer and so needs its retain count to stay positive.
+ AutoreleasePoolBoundary, ///< An autoreleasepool push or pop.
+ CanChangeRetainCount, ///< Anything that can alter the pointer's reference count.
+ RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
+ RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
+ RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+};
+
+void FindDependencies(DependenceKind Flavor,
+ const Value *Arg,
+ BasicBlock *StartBB, Instruction *StartInst,
+ SmallPtrSetImpl<Instruction *> &DependingInstructions,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ ProvenanceAnalysis &PA);
+
+bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+ ProvenanceAnalysis &PA);
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+ ARCInstKind Class);
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+/// Returns true if we cannot conservatively prove that Inst is unable to
+/// decrement the reference count of Ptr; returns false if we can prove it.
+bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+static inline bool CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA) {
+ return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
+}
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
new file mode 100644
index 000000000000..688dd12c408a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -0,0 +1,41 @@
+//===-- ObjCARC.cpp -------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+
+namespace llvm {
+ class PassRegistry;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// initializeObjCARCOpts - Initialize all passes linked into the
+/// ObjCARCOpts library.
+void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
+ initializeObjCARCAAWrapperPassPass(Registry);
+ initializeObjCARCAPElimPass(Registry);
+ initializeObjCARCExpandPass(Registry);
+ initializeObjCARCContractPass(Registry);
+ initializeObjCARCOptPass(Registry);
+ initializePAEvalPass(Registry);
+}
+
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
+ initializeObjCARCOpts(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
new file mode 100644
index 000000000000..cd9b3d96a14f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -0,0 +1,88 @@
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines common definitions/declarations used by the ObjC ARC
+/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
+/// managing reference counts for objects in Objective C.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/ObjCARC.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace llvm {
+class raw_ostream;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim,
+/// so if it's such a call and the return value has users, replace them with the
+/// argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+ Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+ bool Unused = CI->use_empty();
+
+ if (!Unused) {
+ // Replace the return value with the argument.
+ assert((IsForwarding(GetBasicARCInstKind(CI)) ||
+ (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
+ isa<ConstantPointerNull>(OldArg))) &&
+ "Can't delete non-forwarding instruction with users!");
+ CI->replaceAllUsesWith(OldArg);
+ }
+
+ CI->eraseFromParent();
+
+ if (Unused)
+ RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// If Inst is a RetainRV and its operand is a call or invoke, return the
+/// operand. Otherwise return null.
+static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
+ ARCInstKind Class) {
+ if (Class != ARCInstKind::RetainRV)
+ return nullptr;
+
+ const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
+ if (const auto *C = dyn_cast<CallInst>(Opnd))
+ return C;
+ return dyn_cast<InvokeInst>(Opnd);
+}
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
new file mode 100644
index 000000000000..b2c62a0e8eeb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -0,0 +1,176 @@
+//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file implements optimizations which remove extraneous
+/// autorelease pools.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ap-elim"
+
+namespace {
+ /// \brief Autorelease pool elimination.
+ class ObjCARCAPElim : public ModulePass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnModule(Module &M) override;
+
+ static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
+ static bool OptimizeBB(BasicBlock *BB);
+
+ public:
+ static char ID;
+ ObjCARCAPElim() : ModulePass(ID) {
+ initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCAPElim::ID = 0;
+INITIALIZE_PASS(ObjCARCAPElim,
+ "objc-arc-apelim",
+ "ObjC ARC autorelease pool elimination",
+ false, false)
+
+Pass *llvm::createObjCARCAPElimPass() {
+ return new ObjCARCAPElim();
+}
+
+void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+}
+
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
+bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
+ if (const Function *Callee = CS.getCalledFunction()) {
+ if (!Callee->hasExactDefinition())
+ return true;
+ for (const BasicBlock &BB : *Callee) {
+ for (const Instruction &I : BB)
+ if (ImmutableCallSite JCS = ImmutableCallSite(&I))
+ // This recursion depth limit is arbitrary; it is just deep
+ // enough to cover the known interesting test cases.
+ if (Depth < 3 &&
+ !JCS.onlyReadsMemory() &&
+ MayAutorelease(JCS, Depth + 1))
+ return true;
+ }
+ return false;
+ }
+
+ return true;
+}
+
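+/// Remove matching objc_autoreleasePoolPush/objc_autoreleasePoolPop pairs in
+/// a single basic block when nothing in between may autorelease. Illustrative
+/// sketch of a pair that gets zapped:
+///   %pool = call i8* @objc_autoreleasePoolPush()
+///   call void @objc_autoreleasePoolPop(i8* %pool)
+/// Both calls are erased.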
+bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
+ bool Changed = false;
+
+ Instruction *Push = nullptr;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPush:
+ Push = Inst;
+ break;
+ case ARCInstKind::AutoreleasepoolPop:
+ // If this pop matches a push and nothing in between can autorelease,
+ // zap the pair.
+ if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+ Changed = true;
+ DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: " << *Inst << "\n"
+ << " Push: " << *Push << "\n");
+ Inst->eraseFromParent();
+ Push->eraseFromParent();
+ }
+ Push = nullptr;
+ break;
+ case ARCInstKind::CallOrUser:
+ if (MayAutorelease(ImmutableCallSite(Inst)))
+ Push = nullptr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+bool ObjCARCAPElim::runOnModule(Module &M) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!ModuleHasARC(M))
+ return false;
+
+ if (skipModule(M))
+ return false;
+
+ // Find the llvm.global_ctors variable, as the first step in
+ // identifying the global constructors. In theory, unnecessary autorelease
+ // pools could occur anywhere, but in practice it's pretty rare. Global
+ // ctors are a place where autorelease pools get inserted automatically,
+ // so it's pretty common for them to be unnecessary, and it's pretty
+ // profitable to eliminate them.
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return false;
+
+ assert(GV->hasDefinitiveInitializer() &&
+ "llvm.global_ctors is uncooperative!");
+
+ bool Changed = false;
+
+ // Dig the constructor functions out of GV's initializer.
+ ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+ for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+ OI != OE; ++OI) {
+ Value *Op = *OI;
+ // llvm.global_ctors is an array of three-field structs where the second
+ // members are constructor functions.
+ Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // If the user used a constructor function with the wrong signature and
+ // it got bitcasted or whatever, look the other way.
+ if (!F)
+ continue;
+ // Only look at function definitions.
+ if (F->isDeclaration())
+ continue;
+ // Only look at functions with one basic block.
+ if (std::next(F->begin()) != F->end())
+ continue;
+ // Ok, a single-block constructor function definition. Try to optimize it.
+ Changed |= OptimizeBB(&F->front());
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
new file mode 100644
index 000000000000..e70e7591f6a7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -0,0 +1,694 @@
+//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file mainly deals with ``contracting'' multiple lower level
+/// operations into singular higher level operations through pattern matching.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#include "ARCRuntimeEntryPoints.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-contract"
+
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+//===----------------------------------------------------------------------===//
+// Declarations
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// \brief Late ARC optimizations
+ ///
+ /// These change the IR in a way that makes it difficult for ObjCARCOpt to
+ /// analyze, so they are run late.
+ class ObjCARCContract : public FunctionPass {
+ bool Changed;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ ProvenanceAnalysis PA;
+ ARCRuntimeEntryPoints EP;
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ /// The inline asm string to insert between calls and RetainRV calls to make
+ /// the optimization work on targets which need it.
+ const MDString *RVInstMarker;
+
+ /// The set of inserted objc_storeStrong calls. If at the end of walking the
+ /// function we have found no alloca instructions, these calls can be marked
+ /// "tail".
+ SmallPtrSet<CallInst *, 8> StoreStrongCalls;
+
+ /// Returns true if we eliminated Inst.
+ bool tryToPeepholeInstruction(Function &F, Instruction *Inst,
+ inst_iterator &Iter,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ bool &TailOkForStoreStrong);
+
+ bool optimizeRetainCall(Function &F, Instruction *Retain);
+
+ bool
+ contractAutorelease(Function &F, Instruction *Autorelease,
+ ARCInstKind Class,
+ SmallPtrSetImpl<Instruction *> &DependingInstructions,
+ SmallPtrSetImpl<const BasicBlock *> &Visited);
+
+ void tryToContractReleaseIntoStoreStrong(Instruction *Release,
+ inst_iterator &Iter);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ public:
+ static char ID;
+ ObjCARCContract() : FunctionPass(ID) {
+ initializeObjCARCContractPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value. We do this late so we do not disrupt the dataflow analysis in
+/// ObjCARCOpt.
+bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
+ ImmutableCallSite CS(GetArgRCIdentityRoot(Retain));
+ const Instruction *Call = CS.getInstruction();
+ if (!Call)
+ return false;
+ if (Call->getParent() != Retain->getParent())
+ return false;
+
+ // Check that the call is next to the retain.
+ BasicBlock::const_iterator I = ++Call->getIterator();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I != Retain)
+ return false;
+
+ // Turn it to an objc_retainAutoreleasedReturnValue.
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "Transforming objc_retain => "
+ "objc_retainAutoreleasedReturnValue since the operand is a "
+ "return value.\nOld: "<< *Retain << "\n");
+
+ // We do not have to worry about the tail-call or nounwind properties,
+ // since retain and retainRV have the same properties.
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
+ cast<CallInst>(Retain)->setCalledFunction(Decl);
+
+ DEBUG(dbgs() << "New: " << *Retain << "\n");
+ return true;
+}
+
+/// Merge an autorelease with a retain into a fused call.
+bool ObjCARCContract::contractAutorelease(
+ Function &F, Instruction *Autorelease, ARCInstKind Class,
+ SmallPtrSetImpl<Instruction *> &DependingInstructions,
+ SmallPtrSetImpl<const BasicBlock *> &Visited) {
+ const Value *Arg = GetArgRCIdentityRoot(Autorelease);
+
+ // Check that there are no instructions between the retain and the autorelease
+ // (such as an autorelease_pop) which may change the count.
+ CallInst *Retain = nullptr;
+ if (Class == ARCInstKind::AutoreleaseRV)
+ FindDependencies(RetainAutoreleaseRVDep, Arg,
+ Autorelease->getParent(), Autorelease,
+ DependingInstructions, Visited, PA);
+ else
+ FindDependencies(RetainAutoreleaseDep, Arg,
+ Autorelease->getParent(), Autorelease,
+ DependingInstructions, Visited, PA);
+
+ Visited.clear();
+ if (DependingInstructions.size() != 1) {
+ DependingInstructions.clear();
+ return false;
+ }
+
+ Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+ DependingInstructions.clear();
+
+ if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
+ GetArgRCIdentityRoot(Retain) != Arg)
+ return false;
+
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << " Fusing retain/autorelease!\n"
+ " Autorelease:" << *Autorelease << "\n"
+ " Retain: " << *Retain << "\n");
+
+ Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
+ ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
+ : ARCRuntimeEntryPointKind::RetainAutorelease);
+ Retain->setCalledFunction(Decl);
+
+ DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
+
+ EraseInstruction(Autorelease);
+ return true;
+}
+
+static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
+ Instruction *Release,
+ ProvenanceAnalysis &PA,
+ AliasAnalysis *AA) {
+ StoreInst *Store = nullptr;
+ bool SawRelease = false;
+
+ // Get the location associated with Load.
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ auto *LocPtr = Loc.Ptr->stripPointerCasts();
+
+ // Walk down to find the store and the release, which may be in either order.
+ for (auto I = std::next(BasicBlock::iterator(Load)),
+ E = Load->getParent()->end();
+ I != E; ++I) {
+ // If we found the store we were looking for and saw the release,
+ // break. There is no more work to be done.
+ if (Store && SawRelease)
+ break;
+
+ // Now we know that we have not seen either the store or the release. If I
+ // is the release, mark that we saw the release and continue.
+ Instruction *Inst = &*I;
+ if (Inst == Release) {
+ SawRelease = true;
+ continue;
+ }
+
+ // Otherwise, we check if Inst is a "good" store. Grab the instruction class
+ // of Inst.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ // If Inst is an unrelated retain, we don't care about it.
+ //
+ // TODO: This is one area where the optimization could be made more
+ // aggressive.
+ if (IsRetain(Class))
+ continue;
+
+ // If we have seen the store, but not the release...
+ if (Store) {
+ // We need to make sure that it is safe to move the release from its
+ // current position to the store. This requires proving that no
+ // instruction between Store and Release can use the RCIdentityRoot of
+ // Release. If we can prove that Inst is not such a use, we can safely
+ // ignore it and continue...
+ if (!CanUse(Inst, Load, PA, Class)) {
+ continue;
+ }
+
+ // Otherwise, be conservative and return nullptr.
+ return nullptr;
+ }
+
+ // Ok, now we know we have not seen a store yet. See if Inst can write to
+ // our load location; if it cannot, just ignore the instruction.
+ if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod))
+ continue;
+
+ Store = dyn_cast<StoreInst>(Inst);
+
+ // If Inst can, then check whether Inst is a simple store. If Inst is not
+ // a store, or is a store that is not simple, then something we do not
+ // understand is writing to this memory, so we cannot move the load over
+ // the write to any subsequent store that we may find.
+ if (!Store || !Store->isSimple())
+ return nullptr;
+
+ // Then make sure that the pointer we are storing to is Ptr. If so, we
+ // found our Store!
+ if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
+ continue;
+
+ // Otherwise, we have an unknown store to some other ptr that clobbers
+ // Loc.Ptr. Bail!
+ return nullptr;
+ }
+
+ // If we did not find the store or did not see the release, fail.
+ if (!Store || !SawRelease)
+ return nullptr;
+
+ // We succeeded!
+ return Store;
+}
+
+static Instruction *
+findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
+ Instruction *Release,
+ ProvenanceAnalysis &PA) {
+ // Walk up from the Store to find the retain.
+ BasicBlock::iterator I = Store->getIterator();
+ BasicBlock::iterator Begin = Store->getParent()->begin();
+ while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {
+ Instruction *Inst = &*I;
+
+ // It is only safe to move the retain to the store if we can prove
+ // conservatively that nothing besides the release can decrement reference
+ // counts in between the retain and the store.
+ if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
+ return nullptr;
+ --I;
+ }
+ Instruction *Retain = &*I;
+ if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
+ return nullptr;
+ if (GetArgRCIdentityRoot(Retain) != New)
+ return nullptr;
+ return Retain;
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. An objc_storeStrong:
+///
+/// objc_storeStrong(i8** %old_ptr, i8* new_value)
+///
+/// is equivalent to the following IR sequence:
+///
+/// ; Load old value.
+/// %old_value = load i8** %old_ptr (1)
+///
+/// ; Increment the new value and then release the old value. This must occur
+/// ; in order in case old_value releases new_value in its destructor causing
+/// ; us to potentially have a dangling ptr.
+/// tail call i8* @objc_retain(i8* %new_value) (2)
+/// tail call void @objc_release(i8* %old_value) (3)
+///
+/// ; Store the new_value into old_ptr
+/// store i8* %new_value, i8** %old_ptr (4)
+///
+/// The safety of this optimization is based around the following
+/// considerations:
+///
+/// 1. We are forming the store strong at the store. Thus to perform this
+/// optimization it must be safe to move the retain, load, and release to
+/// (4).
+/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
+/// safe.
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
+ inst_iterator &Iter) {
+ // See if we are releasing something that we just loaded.
+ auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
+ if (!Load || !Load->isSimple())
+ return;
+
+ // For now, require everything to be in one basic block.
+ BasicBlock *BB = Release->getParent();
+ if (Load->getParent() != BB)
+ return;
+
+ // First scan down the BB from Load, looking for a store to the location
+ // that Load loads from.
+ StoreInst *Store =
+ findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
+ // If we fail, bail.
+ if (!Store)
+ return;
+
+ // Then find what new_value's RCIdentity Root is.
+ Value *New = GetRCIdentityRoot(Store->getValueOperand());
+
+ // Then walk up the BB and look for a retain on New without any intervening
+ // instructions which conservatively might decrement ref counts.
+ Instruction *Retain =
+ findRetainForStoreStrongContraction(New, Store, Release, PA);
+
+ // If we fail, bail.
+ if (!Retain)
+ return;
+
+ Changed = true;
+ ++NumStoreStrongs;
+
+ DEBUG(
+ llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
+ << " Old:\n"
+ << " Store: " << *Store << "\n"
+ << " Release: " << *Release << "\n"
+ << " Retain: " << *Retain << "\n"
+ << " Load: " << *Load << "\n");
+
+ LLVMContext &C = Release->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+
+ Value *Args[] = { Load->getPointerOperand(), New };
+ if (Args[0]->getType() != I8XX)
+ Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+ if (Args[1]->getType() != I8X)
+ Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
+ CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store);
+ StoreStrong->setDoesNotThrow();
+ StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+ // We can't set the tail flag yet, because we haven't yet determined
+ // whether there are any escaping allocas. Remember this call, so that
+ // we can set the tail flag once we know it's safe.
+ StoreStrongCalls.insert(StoreStrong);
+
+ DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n");
+
+ if (&*Iter == Retain) ++Iter;
+ if (&*Iter == Store) ++Iter;
+ Store->eraseFromParent();
+ Release->eraseFromParent();
+ EraseInstruction(Retain);
+ if (Load->use_empty())
+ Load->eraseFromParent();
+}
+
+bool ObjCARCContract::tryToPeepholeInstruction(
+ Function &F, Instruction *Inst, inst_iterator &Iter,
+ SmallPtrSetImpl<Instruction *> &DependingInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ bool &TailOkForStoreStrongs) {
+ // Only these library routines return their argument. In particular,
+ // objc_retainBlock does not necessarily return its argument.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ return false;
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ return contractAutorelease(F, Inst, Class, DependingInsts, Visited);
+ case ARCInstKind::Retain:
+ // Attempt to convert retains to retainrvs if they are next to function
+ // calls.
+ if (!optimizeRetainCall(F, Inst))
+ return false;
+ // If we succeed in our optimization, fall through.
+ LLVM_FALLTHROUGH;
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV: {
+ // If we're compiling for a target which needs a special inline-asm
+ // marker to do the return value optimization, insert it now.
+ if (!RVInstMarker)
+ return false;
+ BasicBlock::iterator BBI = Inst->getIterator();
+ BasicBlock *InstParent = Inst->getParent();
+
+ // Step up to see if the call immediately precedes the RV call.
+ // If it's an invoke, we have to cross a block boundary. And we have
+ // to carefully dodge no-op instructions.
+ do {
+ if (BBI == InstParent->begin()) {
+ BasicBlock *Pred = InstParent->getSinglePredecessor();
+ if (!Pred)
+ goto decline_rv_optimization;
+ BBI = Pred->getTerminator()->getIterator();
+ break;
+ }
+ --BBI;
+ } while (IsNoopInstruction(&*BBI));
+
+ if (&*BBI == GetArgRCIdentityRoot(Inst)) {
+ DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
+ Changed = true;
+ InlineAsm *IA = InlineAsm::get(
+ FunctionType::get(Type::getVoidTy(Inst->getContext()),
+ /*isVarArg=*/false),
+ RVInstMarker->getString(),
+ /*Constraints=*/"", /*hasSideEffects=*/true);
+ CallInst::Create(IA, "", Inst);
+ }
+ decline_rv_optimization:
+ return false;
+ }
+ case ARCInstKind::InitWeak: {
+ // objc_initWeak(p, null) => *p = null
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(1))) {
+ Value *Null =
+ ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+ Changed = true;
+ new StoreInst(Null, CI->getArgOperand(0), CI);
+
+ DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
+
+ CI->replaceAllUsesWith(Null);
+ CI->eraseFromParent();
+ }
+ return true;
+ }
+ case ARCInstKind::Release:
+ // Try to form an objc_storeStrong from our release. If we fail, there is
+ // nothing further to do below, so continue.
+ tryToContractReleaseIntoStoreStrong(Inst, Iter);
+ return true;
+ case ARCInstKind::User:
+ // Be conservative if the function has any alloca instructions.
+ // Technically we only care about escaping alloca instructions,
+ // but this is sufficient to handle some interesting cases.
+ if (isa<AllocaInst>(Inst))
+ TailOkForStoreStrongs = false;
+ return true;
+ case ARCInstKind::IntrinsicUser:
+ // Remove calls to @clang.arc.use(...).
+ Inst->eraseFromParent();
+ return true;
+ default:
+ return true;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
+bool ObjCARCContract::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+
+ DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+
+ // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+ // keyword. Be conservative if the function has variadic arguments.
+ // It seems that functions which call "returns twice" functions, such as
+ // setjmp, are also unsafe for the "tail" keyword, because setjmp could need
+ // to return to an earlier stack state.
+ bool TailOkForStoreStrongs =
+ !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
+
+ // For ObjC library calls which return their argument, replace uses of the
+ // argument with uses of the call return value, if it dominates the use. This
+ // reduces register pressure.
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
+ Instruction *Inst = &*I++;
+
+ DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ // First try to peephole Inst. If there is nothing further we can do in
+ // terms of undoing objc-arc-expand, process the next inst.
+ if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited,
+ TailOkForStoreStrongs))
+ continue;
+
+ // Otherwise, try to undo objc-arc-expand.
+
+ // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
+ // and such; to do the replacement, the argument must have type i8*.
+
+ // Function for replacing uses of Arg dominated by Inst.
+ auto ReplaceArgUses = [Inst, this](Value *Arg) {
+ // If we're compiling bugpointed code, don't get in trouble.
+ if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+ return;
+
+ // Look through the uses of the pointer.
+ for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ) {
+ // Increment UI now, because we may unlink its element.
+ Use &U = *UI++;
+ unsigned OperandNo = U.getOperandNo();
+
+ // If the call's return value dominates a use of the call's argument
+ // value, rewrite the use to use the return value. We check for
+ // reachability here because an unreachable call is considered to
+ // trivially dominate itself, which would lead us to rewriting its
+ // argument in terms of its return value, which would lead to
+ // infinite loops in GetArgRCIdentityRoot.
+ if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *BB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ &BB->back());
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == BB) {
+ // Keep the UI iterator valid.
+ if (UI != UE &&
+ &PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) == &*UI)
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
+ }
+ }
+ }
+ };
+
+
+ Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+ Value *OrigArg = Arg;
+
+ // TODO: Change this to a do-while.
+ for (;;) {
+ ReplaceArgUses(Arg);
+
+ // If Arg is a no-op casted pointer, strip one level of casts and iterate.
+ if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+ Arg = BI->getOperand(0);
+ else if (isa<GEPOperator>(Arg) &&
+ cast<GEPOperator>(Arg)->hasAllZeroIndices())
+ Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+ else if (isa<GlobalAlias>(Arg) &&
+ !cast<GlobalAlias>(Arg)->isInterposable())
+ Arg = cast<GlobalAlias>(Arg)->getAliasee();
+ else
+ break;
+ }
+
+ // Replace bitcast users of Arg that are dominated by Inst.
+ SmallVector<BitCastInst *, 2> BitCastUsers;
+
+ // Add all bitcast users of the function argument first.
+ for (User *U : OrigArg->users())
+ if (auto *BC = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(BC);
+
+ // Replace the bitcasts with the call return. Iterate until list is empty.
+ while (!BitCastUsers.empty()) {
+ auto *BC = BitCastUsers.pop_back_val();
+ for (User *U : BC->users())
+ if (auto *B = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(B);
+
+ ReplaceArgUses(BC);
+ }
+ }
+
+ // If this function has no escaping allocas or suspicious vararg usage,
+ // objc_storeStrong calls can be marked with the "tail" keyword.
+ if (TailOkForStoreStrongs)
+ for (CallInst *CI : StoreStrongCalls)
+ CI->setTailCall();
+ StoreStrongCalls.clear();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Pass Manager
+//===----------------------------------------------------------------------===//
+
+char ObjCARCContract::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract",
+ "ObjC ARC contraction", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract",
+ "ObjC ARC contraction", false, false)
+
+void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+Pass *llvm::createObjCARCContractPass() { return new ObjCARCContract(); }
+
+bool ObjCARCContract::doInitialization(Module &M) {
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
+ return false;
+
+ EP.init(&M);
+
+ // Initialize RVInstMarker.
+ RVInstMarker = nullptr;
+ if (NamedMDNode *NMD =
+ M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
+ if (NMD->getNumOperands() == 1) {
+ const MDNode *N = NMD->getOperand(0);
+ if (N->getNumOperands() == 1)
+ if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
+ RVInstMarker = S;
+ }
+
+ return false;
+}
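+
+// The marker read above is recorded by the front-end as module-level named
+// metadata holding a single string. A hedged sketch of what such a module
+// might contain (the exact string is target-dependent; the text below is only
+// illustrative):
+//
+//   !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
+//   !0 = !{!"mov\09r7, r7\09\09@ marker for return value optimization"}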
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
new file mode 100644
index 000000000000..bb6a0a0e73db
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -0,0 +1,127 @@
+//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file deals with early optimizations which perform certain
+/// cleanup operations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "objc-arc-expand"
+
+namespace llvm {
+ class Module;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+ /// \brief Early ARC transformations.
+ class ObjCARCExpand : public FunctionPass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ public:
+ static char ID;
+ ObjCARCExpand() : FunctionPass(ID) {
+ initializeObjCARCExpandPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCExpand::ID = 0;
+INITIALIZE_PASS(ObjCARCExpand,
+ "objc-arc-expand", "ObjC ARC expansion", false, false)
+
+Pass *llvm::createObjCARCExpandPass() {
+ return new ObjCARCExpand();
+}
+
+void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+}
+
+bool ObjCARCExpand::doInitialization(Module &M) {
+ Run = ModuleHasARC(M);
+ return false;
+}
+
+bool ObjCARCExpand::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ bool Changed = false;
+
+ DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n");
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ Instruction *Inst = &*I;
+
+ DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV: {
+ // These calls return their argument verbatim, as a low-level
+ // optimization. However, this makes high-level optimizations
+ // harder. Undo any uses of this optimization that the front-end
+ // emitted here. We'll redo them in the contract pass.
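+ // A small illustrative example (hypothetical IR): given
+ //   %x = call i8* @objc_retain(i8* %p)
+ //   call void @use(i8* %x)
+ // the rewrite below turns the use into
+ //   call void @use(i8* %p)
+ // while leaving the retain in place, so later analyses look through the
+ // call result.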
+ Changed = true;
+ Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
+ DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n"
+ " New = " << *Value << "\n");
+ Inst->replaceAllUsesWith(Value);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
new file mode 100644
index 000000000000..8f3a33f66c7f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -0,0 +1,2171 @@
+//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// The optimizations performed include elimination of redundant, partially
+/// redundant, and inconsequential reference count operations, elimination of
+/// redundant weak pointer operations, and numerous minor simplifications.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARCRuntimeEntryPoints.h"
+#include "BlotMapVector.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "PtrState.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-opts"
+
+/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
+/// @{
+
+/// \brief This is similar to GetRCIdentityRoot but it stops as soon
+/// as it finds a value with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+ // ConstantData (like ConstantPointerNull and UndefValue) is used across
+ // modules. It's never a single-use value.
+ if (isa<ConstantData>(Arg))
+ return nullptr;
+
+ if (Arg->hasOneUse()) {
+ if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+ return FindSingleUseIdentifiedObject(BC->getOperand(0));
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+ if (GEP->hasAllZeroIndices())
+ return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+ if (IsForwarding(GetBasicARCInstKind(Arg)))
+ return FindSingleUseIdentifiedObject(
+ cast<CallInst>(Arg)->getArgOperand(0));
+ if (!IsObjCIdentifiedObject(Arg))
+ return nullptr;
+ return Arg;
+ }
+
+ // If we found an identifiable object that has multiple uses, but they are
+ // all trivial uses, we can still consider this to be a single-use value.
+ if (IsObjCIdentifiedObject(Arg)) {
+ for (const User *U : Arg->users())
+ if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
+ return nullptr;
+
+ return Arg;
+ }
+
+ return nullptr;
+}
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+ "retain+autoreleases eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+#ifndef NDEBUG
+STATISTIC(NumRetainsBeforeOpt,
+ "Number of retains before optimization");
+STATISTIC(NumReleasesBeforeOpt,
+ "Number of releases before optimization");
+STATISTIC(NumRetainsAfterOpt,
+ "Number of retains after optimization");
+STATISTIC(NumReleasesAfterOpt,
+ "Number of releases after optimization");
+#endif
+
+namespace {
+ /// \brief Per-BasicBlock state.
+ class BBState {
+ /// The number of unique control paths from the entry which can reach this
+ /// block.
+ unsigned TopDownPathCount;
+
+ /// The number of unique control paths to exits from this block.
+ unsigned BottomUpPathCount;
+
+ /// The top-down traversal uses this to record information known about a
+ /// pointer at the bottom of each block.
+ BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
+
+ /// The bottom-up traversal uses this to record information known about a
+ /// pointer at the top of each block.
+ BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
+
+ /// Effective predecessors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Preds;
+
+ /// Effective successors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Succs;
+
+ public:
+ static const unsigned OverflowOccurredValue;
+
+ BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
+
+ typedef decltype(PerPtrTopDown)::iterator top_down_ptr_iterator;
+ typedef decltype(PerPtrTopDown)::const_iterator const_top_down_ptr_iterator;
+
+ top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+ top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+ const_top_down_ptr_iterator top_down_ptr_begin() const {
+ return PerPtrTopDown.begin();
+ }
+ const_top_down_ptr_iterator top_down_ptr_end() const {
+ return PerPtrTopDown.end();
+ }
+ bool hasTopDownPtrs() const {
+ return !PerPtrTopDown.empty();
+ }
+
+ typedef decltype(PerPtrBottomUp)::iterator bottom_up_ptr_iterator;
+ typedef decltype(
+ PerPtrBottomUp)::const_iterator const_bottom_up_ptr_iterator;
+
+ bottom_up_ptr_iterator bottom_up_ptr_begin() {
+ return PerPtrBottomUp.begin();
+ }
+ bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+ const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
+ return PerPtrBottomUp.begin();
+ }
+ const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
+ return PerPtrBottomUp.end();
+ }
+ bool hasBottomUpPtrs() const {
+ return !PerPtrBottomUp.empty();
+ }
+
+ /// Mark this block as being an entry block, which has one path from the
+ /// entry by definition.
+ void SetAsEntry() { TopDownPathCount = 1; }
+
+ /// Mark this block as being an exit block, which has one path to an exit by
+ /// definition.
+ void SetAsExit() { BottomUpPathCount = 1; }
+
+ /// Attempt to find the PtrState object describing the top down state for
+ /// pointer Arg. Return a new initialized PtrState describing the top down
+ /// state for Arg if we do not find one.
+ TopDownPtrState &getPtrTopDownState(const Value *Arg) {
+ return PerPtrTopDown[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg. Return a new initialized PtrState describing the bottom up
+ /// state for Arg if we do not find one.
+ BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg.
+ bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp.find(Arg);
+ }
+
+ void clearBottomUpPointers() {
+ PerPtrBottomUp.clear();
+ }
+
+ void clearTopDownPointers() {
+ PerPtrTopDown.clear();
+ }
+
+ void InitFromPred(const BBState &Other);
+ void InitFromSucc(const BBState &Other);
+ void MergePred(const BBState &Other);
+ void MergeSucc(const BBState &Other);
+
+ /// Compute the number of possible unique paths from an entry to an exit
+ /// which pass through this block. This is only valid after both the
+ /// top-down and bottom-up traversals are complete.
+ ///
+ /// Returns true if overflow occurred. Returns false if overflow did not
+ /// occur.
+ bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
+ if (TopDownPathCount == OverflowOccurredValue ||
+ BottomUpPathCount == OverflowOccurredValue)
+ return true;
+ unsigned long long Product =
+ (unsigned long long)TopDownPathCount*BottomUpPathCount;
+ // Overflow occurred if any of the upper bits of Product are set, or if the
+ // lower 32 bits of Product are all set (the reserved OverflowOccurredValue).
+ return (Product >> 32) ||
+ ((PathCount = Product) == OverflowOccurredValue);
+ }
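+
+ // A worked example with illustrative numbers: TopDownPathCount == 0x10000
+ // and BottomUpPathCount == 0x10000 give Product == 0x100000000, so
+ // (Product >> 32) is nonzero and overflow is reported. With counts 3 and 5,
+ // Product == 15, no upper bits are set, PathCount becomes 15, and the
+ // function returns false.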
+
+ // Specialized CFG utilities.
+ typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
+ edge_iterator pred_begin() const { return Preds.begin(); }
+ edge_iterator pred_end() const { return Preds.end(); }
+ edge_iterator succ_begin() const { return Succs.begin(); }
+ edge_iterator succ_end() const { return Succs.end(); }
+
+ void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+ void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+ bool isExit() const { return Succs.empty(); }
+ };
+
+ const unsigned BBState::OverflowOccurredValue = 0xffffffff;
+}
+
+namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS,
+ BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
+}
+
+void BBState::InitFromPred(const BBState &Other) {
+ PerPtrTopDown = Other.PerPtrTopDown;
+ TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+ PerPtrBottomUp = Other.PerPtrBottomUp;
+ BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// The top-down traversal uses this to merge information about predecessors to
+/// form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+ if (TopDownPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.TopDownPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ TopDownPathCount += Other.TopDownPathCount;
+
+ // In order to be consistent, we clear the top down pointers when, as a
+ // result of this addition, TopDownPathCount becomes OverflowOccurredValue,
+ // even though "true" overflow has not occurred.
+ if (TopDownPathCount == OverflowOccurredValue) {
+ clearTopDownPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ TopDownPathCount = OverflowOccurredValue;
+ clearTopDownPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the same key,
+ // merge the entries. Otherwise, copy the entry and merge it with an empty
+ // entry.
+ for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrTopDown.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
+ /*TopDown=*/true);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry with the
+ // same key, force it to merge with an empty entry.
+ for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
+ if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+ MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
+}
+
+/// The bottom-up traversal uses this to merge information about successors to
+/// form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+ if (BottomUpPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ BottomUpPathCount += Other.BottomUpPathCount;
+
+ // In order to be consistent, we clear the bottom up pointers when, as a
+ // result of this addition, BottomUpPathCount becomes OverflowOccurredValue,
+ // even though "true" overflow has not occurred.
+ if (BottomUpPathCount == OverflowOccurredValue) {
+ clearBottomUpPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ BottomUpPathCount = OverflowOccurredValue;
+ clearBottomUpPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the
+ // same key, merge the entries. Otherwise, copy the entry and merge
+ // it with an empty entry.
+ for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrBottomUp.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
+ /*TopDown=*/false);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry
+ // with the same key, force it to merge with an empty entry.
+ for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
+ ++MI)
+ if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+ MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
+ // Dump the pointers we are tracking.
+ OS << " TopDown State:\n";
+ if (!BBInfo.hasTopDownPtrs()) {
+ DEBUG(llvm::dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ OS << " BottomUp State:\n";
+ if (!BBInfo.hasBottomUpPtrs()) {
+ DEBUG(llvm::dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ return OS;
+}
+
+namespace {
+
+ /// \brief The main ARC optimization pass.
+ class ObjCARCOpt : public FunctionPass {
+ bool Changed;
+ ProvenanceAnalysis PA;
+
+ /// A cache of references to runtime entry point constants.
+ ARCRuntimeEntryPoints EP;
+
+ /// A cache of MDKinds that can be passed into other functions to propagate
+ /// MDKind identifiers.
+ ARCMDKindCache MDKindCache;
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ /// Flags which determine whether each of the interesting runtime functions
+ /// is in fact used in the current function.
+ unsigned UsedInThisFunction;
+
+ bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+ void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+ ARCInstKind &Class);
+ void OptimizeIndividualCalls(Function &F);
+
+ void CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const;
+ bool VisitInstructionBottomUp(Instruction *Inst, BasicBlock *BB,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates);
+ bool VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains);
+ bool VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates);
+ bool VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases);
+ bool Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases);
+
+ void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts, Module *M);
+
+ bool
+ PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ Instruction * Retain,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+ Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated);
+
+ bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M);
+
+ void OptimizeWeakCalls(Function &F);
+
+ bool OptimizeSequences(Function &F);
+
+ void OptimizeReturns(Function &F);
+
+#ifndef NDEBUG
+ void GatherStatistics(Function &F, bool AfterOptimization = false);
+#endif
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ void releaseMemory() override;
+
+ public:
+ static char ID;
+ ObjCARCOpt() : FunctionPass(ID) {
+ initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCOpt,
+ "objc-arc", "ObjC ARC optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
+INITIALIZE_PASS_END(ObjCARCOpt,
+ "objc-arc", "ObjC ARC optimization", false, false)
+
+Pass *llvm::createObjCARCOptPass() {
+ return new ObjCARCOpt();
+}
+
+void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<ObjCARCAAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ // ARC optimization doesn't currently split critical edges.
+ AU.setPreservesCFG();
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value. Or, if it can be paired with an
+/// objc_autoreleaseReturnValue, delete the pair and return true.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+ // Check for the argument being from an immediately preceding call or invoke.
+ const Value *Arg = GetArgRCIdentityRoot(RetainRV);
+ ImmutableCallSite CS(Arg);
+ if (const Instruction *Call = CS.getInstruction()) {
+ if (Call->getParent() == RetainRV->getParent()) {
+ BasicBlock::const_iterator I(Call);
+ ++I;
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock *RetainRVParent = RetainRV->getParent();
+ if (II->getNormalDest() == RetainRVParent) {
+ BasicBlock::const_iterator I = RetainRVParent->begin();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ }
+ }
+ }
+
+ // Check for being preceded by an objc_autoreleaseReturnValue on the same
+ // pointer. In this case, we can delete the pair.
+ BasicBlock::iterator I = RetainRV->getIterator(),
+ Begin = RetainRV->getParent()->begin();
+ if (I != Begin) {
+ do
+ --I;
+ while (I != Begin && IsNoopInstruction(&*I));
+ if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV &&
+ GetArgRCIdentityRoot(&*I) == Arg) {
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
+ << "Erasing " << *RetainRV << "\n");
+
+ EraseInstruction(&*I);
+ EraseInstruction(RetainRV);
+ return true;
+ }
+ }
+
+ // Turn it into a plain objc_retain.
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ "Old = " << *RetainRV << "\n");
+
+ Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
+
+ DEBUG(dbgs() << "New = " << *RetainRV << "\n");
+
+ return false;
+}
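+
+// Two illustrative shapes (hypothetical value names) handled above: when the
+// pair
+//   call i8* @objc_autoreleaseReturnValue(i8* %p)
+//   call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+// sits back to back on the same pointer, both calls are erased; otherwise, if
+// %p is not the result of an immediately preceding call or invoke, the
+// retainRV is demoted to a plain
+//   call i8* @objc_retain(i8* %p)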
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
+void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
+ Instruction *AutoreleaseRV,
+ ARCInstKind &Class) {
+ // Check for a return of the pointer value.
+ const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
+
+ // If the argument is ConstantPointerNull or UndefValue, its other users
+ // aren't actually interesting to look at.
+ if (isa<ConstantData>(Ptr))
+ return;
+
+ SmallVector<const Value *, 2> Users;
+ Users.push_back(Ptr);
+ do {
+ Ptr = Users.pop_back_val();
+ for (const User *U : Ptr->users()) {
+ if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
+ return;
+ if (isa<BitCastInst>(U))
+ Users.push_back(U);
+ }
+ } while (!Users.empty());
+
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "Transforming objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ "Old = " << *AutoreleaseRV << "\n");
+
+ CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+ Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
+ AutoreleaseRVCI->setCalledFunction(NewDecl);
+ AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+ Class = ARCInstKind::Autorelease;
+
+ DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+
+}
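+
+// An illustrative example (hypothetical IR): if the result of
+//   %v = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+// is never returned and never reaches an objc_retainAutoreleasedReturnValue,
+// the call is rewritten above to
+//   %v = call i8* @objc_autorelease(i8* %p)
+// with the tail marker dropped, since objc_autorelease is never tail called.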
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+ DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
+ // Reset all the flags in preparation for recomputing them.
+ UsedInThisFunction = 0;
+
+ // Visit all objc_* calls in F.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
+
+ switch (Class) {
+ default: break;
+
+ // Delete no-op casts. These function calls have special semantics, but
+ // the semantics are entirely implemented via lowering in the front-end,
+ // so by the time they reach the optimizer, they are just no-op calls
+ // which return their argument.
+ //
+ // There are gray areas here, as the ability to cast reference-counted
+ // pointers to raw void* and back allows code to break ARC assumptions;
+ // however, these are currently considered to be unimportant.
+ case ARCInstKind::NoopCast:
+ Changed = true;
+ ++NumNoops;
+ DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
+ EraseInstruction(Inst);
+ continue;
+
+ // If the pointer-to-weak-pointer is null, it's undefined behavior.
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::DestroyWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty),
+ CI);
+ llvm::Value *NewValue = UndefValue::get(CI->getType());
+ DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ continue;
+ }
+ break;
+ }
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::MoveWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0)) ||
+ IsNullOrUndef(CI->getArgOperand(1))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty),
+ CI);
+
+ llvm::Value *NewValue = UndefValue::get(CI->getType());
+ DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
+
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ continue;
+ }
+ break;
+ }
+ case ARCInstKind::RetainRV:
+ if (OptimizeRetainRVCall(F, Inst))
+ continue;
+ break;
+ case ARCInstKind::AutoreleaseRV:
+ OptimizeAutoreleaseRVCall(F, Inst, Class);
+ break;
+ }
+
+ // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+ if (IsAutorelease(Class) && Inst->use_empty()) {
+ CallInst *Call = cast<CallInst>(Inst);
+ const Value *Arg = Call->getArgOperand(0);
+ Arg = FindSingleUseIdentifiedObject(Arg);
+ if (Arg) {
+ Changed = true;
+ ++NumAutoreleases;
+
+ // Create the declaration lazily.
+ LLVMContext &C = Inst->getContext();
+
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "",
+ Call);
+ NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
+ MDNode::get(C, None));
+
+ DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+ "since x is otherwise unused.\nOld: " << *Call << "\nNew: "
+ << *NewCall << "\n");
+
+ EraseInstruction(Call);
+ Inst = NewCall;
+ Class = ARCInstKind::Release;
+ }
+ }
+
+ // For functions which can never be passed stack arguments, add
+ // a tail keyword.
+ if (IsAlwaysTail(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "Adding tail keyword to function since it can never be "
+ "passed stack args: " << *Inst << "\n");
+ cast<CallInst>(Inst)->setTailCall();
+ }
+
+ // Ensure that functions that can never have a "tail" keyword due to the
+ // semantics of ARC truly do not do so.
+ if (IsNeverTail(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst <<
+ "\n");
+ cast<CallInst>(Inst)->setTailCall(false);
+ }
+
+ // Set nounwind as needed.
+ if (IsNoThrow(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
+ << "\n");
+ cast<CallInst>(Inst)->setDoesNotThrow();
+ }
+
+ if (!IsNoopOnNull(Class)) {
+ UsedInThisFunction |= 1 << unsigned(Class);
+ continue;
+ }
+
+ const Value *Arg = GetArgRCIdentityRoot(Inst);
+
+ // ARC calls with null are no-ops. Delete them.
+ if (IsNullOrUndef(Arg)) {
+ Changed = true;
+ ++NumNoops;
+ DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
+ << "\n");
+ EraseInstruction(Inst);
+ continue;
+ }
+
+ // Keep track of which of retain, release, autorelease, and retain_block
+ // are actually present in this function.
+ UsedInThisFunction |= 1 << unsigned(Class);
+
+ // If Arg is a PHI, and one or more incoming values to the
+ // PHI are null, and the call is control-equivalent to the PHI, and there
+ // are no relevant side effects between the PHI and the call, the call
+ // could be pushed up to just those paths with non-null incoming values.
+ // For now, don't bother splitting critical edges for this.
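+ // A sketch of the shape this handles (illustrative IR and block names):
+ //   %a = phi i8* [ null, %bb.null ], [ %p, %bb.nonnull ]
+ //   call void @objc_release(i8* %a)
+ // Since an ARC call on null is a no-op, the release can be cloned into
+ // %bb.nonnull (operating on %p) and the original call erased.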
+ SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+ Worklist.push_back(std::make_pair(Inst, Arg));
+ do {
+ std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+ Inst = Pair.first;
+ Arg = Pair.second;
+
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN) continue;
+
+ // Determine if the PHI has any null operands, or any incoming
+ // critical edges.
+ bool HasNull = false;
+ bool HasCriticalEdges = false;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming =
+ GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ HasNull = true;
+ else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
+ .getNumSuccessors() != 1) {
+ HasCriticalEdges = true;
+ break;
+ }
+ }
+ // If we have null operands and no critical edges, optimize.
+ if (!HasCriticalEdges && HasNull) {
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+
+ // Check that there is nothing that cares about the reference
+ // count between the call and the phi.
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainBlock:
+ // These can always be moved up.
+ break;
+ case ARCInstKind::Release:
+ // These can't be moved across things that care about the retain
+ // count.
+ FindDependencies(NeedsPositiveRetainCount, Arg,
+ Inst->getParent(), Inst,
+ DependingInstructions, Visited, PA);
+ break;
+ case ARCInstKind::Autorelease:
+ // These can't be moved across autorelease pool scope boundaries.
+ FindDependencies(AutoreleasePoolBoundary, Arg,
+ Inst->getParent(), Inst,
+ DependingInstructions, Visited, PA);
+ break;
+ case ARCInstKind::ClaimRV:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::AutoreleaseRV:
+ // Don't move these; the RV optimization depends on the autoreleaseRV
+ // being tail called, and the retainRV being immediately after a call
+ // (which might still happen if we get lucky with codegen layout, but
+ // it's not worth taking the chance).
+ continue;
+ default:
+ llvm_unreachable("Invalid dependence flavor");
+ }
+
+ if (DependingInstructions.size() == 1 &&
+ *DependingInstructions.begin() == PN) {
+ Changed = true;
+ ++NumPartialNoops;
+ // Clone the call into each predecessor that has a non-null value.
+ CallInst *CInst = cast<CallInst>(Inst);
+ Type *ParamTy = CInst->getArgOperand(0)->getType();
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming =
+ GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (!IsNullOrUndef(Incoming)) {
+ CallInst *Clone = cast<CallInst>(CInst->clone());
+ Value *Op = PN->getIncomingValue(i);
+ Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ if (Op->getType() != ParamTy)
+ Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+ Clone->setArgOperand(0, Op);
+ Clone->insertBefore(InsertPos);
+
+ DEBUG(dbgs() << "Cloning "
+ << *CInst << "\n"
+ "And inserting clone at " << *InsertPos << "\n");
+ Worklist.push_back(std::make_pair(Clone, Incoming));
+ }
+ }
+ // Erase the original call.
+ DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ EraseInstruction(CInst);
+ continue;
+ }
+ }
+ } while (!Worklist.empty());
+ }
+}
+
+/// If we have a top down pointer in the S_Use state, make sure that there are
+/// no CFG hazards by checking the states of various bottom up pointers.
+static void CheckForUseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe,
+ bool &ShouldContinue) {
+ switch (SuccSSeq) {
+ case S_CanRelease: {
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
+ S.ClearSequenceProgress();
+ break;
+ }
+ S.SetCFGHazardAfflicted(true);
+ ShouldContinue = true;
+ break;
+ }
+ case S_Use:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// If we have a Top Down pointer in the S_CanRelease state, make sure that
+/// there are no CFG hazards by checking the states of various bottom up
+/// pointers.
+static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe) {
+ switch (SuccSSeq) {
+ case S_CanRelease:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// Check for critical edges, loop boundaries, irreducible control flow, or
+/// other CFG structures where moving code across the edge would result in it
+/// being executed more.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const {
+ // If any top-down local-use or possible-dec has a succ which is earlier in
+ // the sequence, forget it.
+ for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
+ I != E; ++I) {
+ TopDownPtrState &S = I->second;
+ const Sequence Seq = I->second.GetSeq();
+
+ // We only care about S_Retain, S_CanRelease, and S_Use.
+ if (Seq == S_None)
+ continue;
+
+ // Make sure that if extra top down states are added in the future, this
+ // code is updated to handle them.
+ assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
+ "Unknown top down sequence state.");
+
+ const Value *Arg = I->first;
+ const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+ bool SomeSuccHasSame = false;
+ bool AllSuccsHaveSame = true;
+ bool NotAllSeqEqualButKnownSafe = false;
+
+ succ_const_iterator SI(TI), SE(TI, false);
+
+ for (; SI != SE; ++SI) {
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ const DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(*SI);
+ assert(BBI != BBStates.end());
+ const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ const Sequence SuccSSeq = SuccS.GetSeq();
+
+ // If, bottom up, the pointer is in an S_None state, clear the sequence
+ // progress, since the bottom up sequence finished, suggesting a mismatch
+ // between retains and releases. This is true for all three cases that we
+ // are handling here: S_Retain, S_Use, and S_CanRelease.
+ if (SuccSSeq == S_None) {
+ S.ClearSequenceProgress();
+ continue;
+ }
+
+ // If we have S_Use or S_CanRelease, perform our CFG hazard checks.
+ const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
+
+ // *NOTE* We do not use Seq from above here since we are allowing for
+ // S.GetSeq() to change while we are visiting basic blocks.
+ switch(S.GetSeq()) {
+ case S_Use: {
+ bool ShouldContinue = false;
+ CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
+ AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
+ ShouldContinue);
+ if (ShouldContinue)
+ continue;
+ break;
+ }
+ case S_CanRelease: {
+ CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
+ SomeSuccHasSame, AllSuccsHaveSame,
+ NotAllSeqEqualButKnownSafe);
+ break;
+ }
+ case S_Retain:
+ case S_None:
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ break;
+ }
+ }
+
+ // If the state at the other end of any of the successor edges
+ // matches the current state, require all edges to match. This
+ // guards against loops in the middle of a sequence.
+ if (SomeSuccHasSame && !AllSuccsHaveSame) {
+ S.ClearSequenceProgress();
+ } else if (NotAllSeqEqualButKnownSafe) {
+ // If we would have cleared the state were it not for the fact that we are
+ // known safe, stop code motion. This is because whether or not it is safe
+ // to remove RR pairs via KnownSafe is orthogonal to whether we are allowed
+ // to perform code motion.
+ S.SetCFGHazardAfflicted(true);
+ }
+ }
+}
+
+bool ObjCARCOpt::VisitInstructionBottomUp(
+ Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ DEBUG(dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
+ break;
+ }
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ if (S.MatchWithRetain()) {
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because
+ // it's better to let it remain as the first instruction after a call.
+ if (Class != ARCInstKind::RetainRV) {
+ DEBUG(llvm::dbgs() << " Matching with: " << *Inst << "\n");
+ Retains[Inst] = S.GetRRInfo();
+ }
+ S.ClearSequenceProgress();
+ }
+ // A retain moving bottom up can be a use.
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearBottomUpPointers();
+ return NestingDetected;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These are irrelevant.
+ return NestingDetected;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.bottom_up_ptr_begin(),
+ ME = MyStates.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ BottomUpPtrState &S = MI->second;
+
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains) {
+
+ DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each successor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ }
+
+ DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, bottom-up.
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+ Instruction *Inst = &*std::prev(I);
+
+ // Invoke instructions are visited as part of their successors (below).
+ if (isa<InvokeInst>(Inst))
+ continue;
+
+ DEBUG(dbgs() << " Visiting " << *Inst << "\n");
+
+ NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+ }
+
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+ }
+
+ DEBUG(llvm::dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ DEBUG(llvm::dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable. We need to break since
+ // a retain can be a potential use.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ NestingDetected |= S.InitTopDown(Class, Inst);
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break;
+ }
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ // Try to form a tentative pair in between this release instruction and the
+ // top down pointers that we are tracking.
+ if (S.MatchWithRelease(MDKindCache, Inst)) {
+ // If we succeed, copy S's RRInfo into the Release -> {Retain Set
+ // Map}. Then we clear S.
+ DEBUG(llvm::dbgs() << " Matching with: " << *Inst << "\n");
+ Releases[Inst] = S.GetRRInfo();
+ S.ClearSequenceProgress();
+ }
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearTopDownPointers();
+ return false;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These cannot be uses of the pointers we are tracking.
+ return false;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.top_down_ptr_begin(),
+ ME = MyStates.top_down_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ TopDownPtrState &S = MI->second;
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases) {
+ DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each predecessor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
+ const BasicBlock *Pred = *PI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.InitFromPred(I->second);
+ ++PI;
+ for (; PI != PE; ++PI) {
+ Pred = *PI;
+ I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.MergePred(I->second);
+ }
+ }
+
+ DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, top-down.
+ for (Instruction &Inst : *BB) {
+ DEBUG(dbgs() << " Visiting " << Inst << "\n");
+
+ NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
+ }
+
+ DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
+ CheckForCFGHazards(BB, BBStates, MyStates);
+ DEBUG(llvm::dbgs() << "Final State:\n" << BBStates[BB] << "\n");
+ return NestingDetected;
+}
+
+static void
+ComputePostOrders(Function &F,
+ SmallVectorImpl<BasicBlock *> &PostOrder,
+ SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+ unsigned NoObjCARCExceptionsMDKind,
+ DenseMap<const BasicBlock *, BBState> &BBStates) {
+ /// The visited set, for doing DFS walks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+ // Do DFS, computing the PostOrder.
+ SmallPtrSet<BasicBlock *, 16> OnStack;
+ SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+ // Functions always have exactly one entry block, and we don't have
+ // any other block that we treat like an entry block.
+ BasicBlock *EntryBB = &F.getEntryBlock();
+ BBState &MyStates = BBStates[EntryBB];
+ MyStates.SetAsEntry();
+ TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+ SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+ Visited.insert(EntryBB);
+ OnStack.insert(EntryBB);
+ do {
+ dfs_next_succ:
+ BasicBlock *CurrBB = SuccStack.back().first;
+ TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+ succ_iterator SE(TI, false);
+
+ while (SuccStack.back().second != SE) {
+ BasicBlock *SuccBB = *SuccStack.back().second++;
+ if (Visited.insert(SuccBB).second) {
+ TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+ SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBState &SuccStates = BBStates[SuccBB];
+ SuccStates.addPred(CurrBB);
+ OnStack.insert(SuccBB);
+ goto dfs_next_succ;
+ }
+
+ if (!OnStack.count(SuccBB)) {
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBStates[SuccBB].addPred(CurrBB);
+ }
+ }
+ OnStack.erase(CurrBB);
+ PostOrder.push_back(CurrBB);
+ SuccStack.pop_back();
+ } while (!SuccStack.empty());
+
+ Visited.clear();
+
+ // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+ // Functions may have many exits, and there are also blocks which we treat
+ // as exits due to ignored edges.
+ SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+ for (BasicBlock &ExitBB : F) {
+ BBState &MyStates = BBStates[&ExitBB];
+ if (!MyStates.isExit())
+ continue;
+
+ MyStates.SetAsExit();
+
+ PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin()));
+ Visited.insert(&ExitBB);
+ while (!PredStack.empty()) {
+ reverse_dfs_next_succ:
+ BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+ while (PredStack.back().second != PE) {
+ BasicBlock *BB = *PredStack.back().second++;
+ if (Visited.insert(BB).second) {
+ PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+ goto reverse_dfs_next_succ;
+ }
+ }
+ ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+ }
+ }
+}
+
+// Visit the function both top-down and bottom-up.
+bool ObjCARCOpt::Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases) {
+
+ // Use reverse-postorder traversals, because we magically know that loops
+ // will be well behaved, i.e. they won't repeatedly call retain on a single
+ // pointer without doing a release. We can't use the ReversePostOrderTraversal
+ // class here because we want the reverse-CFG postorder to consider each
+ // function exit point, and we want to ignore selected cycle edges.
+ SmallVector<BasicBlock *, 16> PostOrder;
+ SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+ ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+ MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
+ BBStates);
+
+ // Use reverse-postorder on the reverse CFG for bottom-up.
+ bool BottomUpNestingDetected = false;
+ for (BasicBlock *BB : reverse(ReverseCFGPostOrder))
+ BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
+
+ // Use reverse-postorder for top-down.
+ bool TopDownNestingDetected = false;
+ for (BasicBlock *BB : reverse(PostOrder))
+ TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
+
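+ // Nesting is only reported when both directions observed it; the caller
+ // (OptimizeSequences) uses this to decide whether another pass over the
+ // function could expose further retain+release pairs.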
+ return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M) {
+ Type *ArgTy = Arg->getType();
+ Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+ DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+
+ // Insert the new retain and release calls.
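+ // Note the crossover: the new retains are inserted at the reverse insertion
+ // points collected for the releases and vice versa, since each RRInfo records
+ // how far the calls of the opposite kind can be moved.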
+ for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ Call->setDoesNotThrow();
+ Call->setTailCall();
+
+ DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n"
+ "At insertion point: " << *InsertPt << "\n");
+ }
+ for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ // Attach a clang.imprecise_release metadata tag, if appropriate.
+ if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+ Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
+ Call->setDoesNotThrow();
+ if (ReleasesToMove.IsTailCallRelease)
+ Call->setTailCall();
+
+ DEBUG(dbgs() << "Inserting new Release: " << *Call << "\n"
+ "At insertion point: " << *InsertPt << "\n");
+ }
+
+ // Delete the original retain and release calls.
+ for (Instruction *OrigRetain : RetainsToMove.Calls) {
+ Retains.blot(OrigRetain);
+ DeadInsts.push_back(OrigRetain);
+ DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
+ }
+ for (Instruction *OrigRelease : ReleasesToMove.Calls) {
+ Releases.erase(OrigRelease);
+ DeadInsts.push_back(OrigRelease);
+ DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
+ }
+
+}
+
+bool ObjCARCOpt::PairUpRetainsAndReleases(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ Instruction *Retain,
+ SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated) {
+ // If a pair happens in a region where it is known that the reference count
+ // is already incremented, we can similarly ignore possible decrements unless
+ // we are dealing with a retainable object with multiple provenance sources.
+ bool KnownSafeTD = true, KnownSafeBU = true;
+ bool CFGHazardAfflicted = false;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ // This is an iterative process so that we connect multiple releases
+ // to multiple retains if needed.
+ unsigned OldDelta = 0;
+ unsigned NewDelta = 0;
+ unsigned OldCount = 0;
+ unsigned NewCount = 0;
+ bool FirstRelease = true;
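+ // OldDelta/OldCount track the path-count-weighted balance of the calls being
+ // removed; NewDelta/NewCount do the same for the insertion points being
+ // added. Both deltas must net to zero before the pairing is accepted (the
+ // insertion points are dropped entirely in the fully known-safe case).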
+ for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+ SmallVector<Instruction *, 4> NewReleases;
+ for (Instruction *NewRetain : NewRetains) {
+ auto It = Retains.find(NewRetain);
+ assert(It != Retains.end());
+ const RRInfo &NewRetainRRI = It->second;
+ KnownSafeTD &= NewRetainRRI.KnownSafe;
+ for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
+ auto Jt = Releases.find(NewRetainRelease);
+ if (Jt == Releases.end())
+ return false;
+ const RRInfo &NewRetainReleaseRRI = Jt->second;
+
+ // If the release does not have a reference to the retain as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+ return false;
+
+ if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
+
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta -= PathCount;
+
+ // Merge the ReleaseMetadata and IsTailCallRelease values.
+ if (FirstRelease) {
+ ReleasesToMove.ReleaseMetadata =
+ NewRetainReleaseRRI.ReleaseMetadata;
+ ReleasesToMove.IsTailCallRelease =
+ NewRetainReleaseRRI.IsTailCallRelease;
+ FirstRelease = false;
+ } else {
+ if (ReleasesToMove.ReleaseMetadata !=
+ NewRetainReleaseRRI.ReleaseMetadata)
+ ReleasesToMove.ReleaseMetadata = nullptr;
+ if (ReleasesToMove.IsTailCallRelease !=
+ NewRetainReleaseRRI.IsTailCallRelease)
+ ReleasesToMove.IsTailCallRelease = false;
+ }
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
+ if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta -= PathCount;
+ }
+ }
+ NewReleases.push_back(NewRetainRelease);
+ }
+ }
+ }
+ NewRetains.clear();
+ if (NewReleases.empty()) break;
+
+ // Back the other way.
+ for (Instruction *NewRelease : NewReleases) {
+ auto It = Releases.find(NewRelease);
+ assert(It != Releases.end());
+ const RRInfo &NewReleaseRRI = It->second;
+ KnownSafeBU &= NewReleaseRRI.KnownSafe;
+ CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
+ for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
+ auto Jt = Retains.find(NewReleaseRetain);
+ if (Jt == Retains.end())
+ return false;
+ const RRInfo &NewReleaseRetainRRI = Jt->second;
+
+ // If the retain does not have a reference to the release as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+ return false;
+
+ if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta += PathCount;
+ OldCount += PathCount;
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
+ if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta += PathCount;
+ NewCount += PathCount;
+ }
+ }
+ NewRetains.push_back(NewReleaseRetain);
+ }
+ }
+ }
+ if (NewRetains.empty()) break;
+ }
+
+ // We can only remove pointers if we are known safe in both directions.
+ bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
+ if (UnconditionallySafe) {
+ RetainsToMove.ReverseInsertPts.clear();
+ ReleasesToMove.ReverseInsertPts.clear();
+ NewCount = 0;
+ } else {
+ // Determine whether the new insertion points we computed preserve the
+ // balance of retain and release calls through the program.
+ // TODO: If the fully aggressive solution isn't valid, try to find a
+ // less aggressive solution which is.
+ if (NewDelta != 0)
+ return false;
+
+ // At this point, we are not going to remove any RR pairs, but we still are
+ // able to move RR pairs. If one of our pointers is afflicted with
+ // CFGHazards, we cannot perform such code motion so exit early.
+ const bool WillPerformCodeMotion = RetainsToMove.ReverseInsertPts.size() ||
+ ReleasesToMove.ReverseInsertPts.size();
+ if (CFGHazardAfflicted && WillPerformCodeMotion)
+ return false;
+ }
+
+ // Determine whether the original call points are balanced in the retain and
+ // release calls through the program. If not, conservatively don't touch
+ // them.
+ // TODO: It's theoretically possible to do code motion in this case, as
+ // long as the existing imbalances are maintained.
+ if (OldDelta != 0)
+ return false;
+
+ Changed = true;
+ assert(OldCount != 0 && "Unreachable code?");
+ NumRRs += OldCount - NewCount;
+ // Set to true if we completely removed any RR pairs.
+ AnyPairsCompletelyEliminated = NewCount == 0;
+
+ // We can move calls!
+ return true;
+}
+
+/// Identify pairings between the retains and releases, and delete and/or move
+/// them.
+bool ObjCARCOpt::PerformCodePlacement(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M) {
+ DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+
+ bool AnyPairsCompletelyEliminated = false;
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ // Visit each retain.
+ for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+ E = Retains.end();
+ I != E; ++I) {
+ Value *V = I->first;
+ if (!V) continue; // blotted
+
+ Instruction *Retain = cast<Instruction>(V);
+
+ DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
+
+ Value *Arg = GetArgRCIdentityRoot(Retain);
+
+ // If the object being released is in static or stack storage, we know it's
+ // not being managed by ObjC reference counting, so we can delete pairs
+ // regardless of what possible decrements or uses lie between them.
+ bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(
+ GetRCIdentityRoot(LI->getPointerOperand())))
+ if (GV->isConstant())
+ KnownSafe = true;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ RRInfo RetainsToMove, ReleasesToMove;
+
+ bool PerformMoveCalls = PairUpRetainsAndReleases(
+ BBStates, Retains, Releases, M, Retain, DeadInsts,
+ RetainsToMove, ReleasesToMove, Arg, KnownSafe,
+ AnyPairsCompletelyEliminated);
+
+ if (PerformMoveCalls) {
+ // Ok, everything checks out and we're all set. Let's move/delete some
+ // code!
+ MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+ Retains, Releases, DeadInsts, M);
+ }
+ }
+
+ // Now that we're done moving everything, we can delete the newly dead
+ // instructions, as we no longer need them as insert points.
+ while (!DeadInsts.empty())
+ EraseInstruction(DeadInsts.pop_back_val());
+
+ return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+ DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+
+ // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+ // itself because it uses AliasAnalysis and we need to do provenance
+ // queries instead.
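+ // For illustration only (hypothetical IR, not taken from a test case), a
+ // sequence such as
+ //   call i8* @objc_storeWeak(i8** %p, i8* %x)
+ //   %v = call i8* @objc_loadWeak(i8** %p)
+ // lets the load be replaced by the stored value once provenance analysis
+ // proves both calls operate on the same weak slot with no clobber between.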
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::LoadWeak &&
+ Class != ARCInstKind::LoadWeakRetained)
+ continue;
+
+ // Delete objc_loadWeak calls with no users.
+ if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
+ Inst->eraseFromParent();
+ continue;
+ }
+
+ // TODO: For now, just look for an earlier available version of this value
+ // within the same block. Theoretically, we could do memdep-style non-local
+ // analysis too, but that would want caching. A better approach would be to
+ // use the technique that EarlyCSE uses.
+ inst_iterator Current = std::prev(I);
+ BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();
+ for (BasicBlock::iterator B = CurrentBB->begin(),
+ J = Current.getInstructionIterator();
+ J != B; --J) {
+ Instruction *EarlierInst = &*std::prev(J);
+ ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
+ switch (EarlierClass) {
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained: {
+ // If this is loading from the same pointer, replace this load's value
+ // with that one.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall);
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak: {
+ // If this is storing to the same pointer and has the same size etc.
+ // replace this load's value with the stored value.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+ // TODO: Grab the copied value.
+ goto clobbered;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // Weak pointers are only modified through the weak entry points
+ // (and arbitrary calls, which could call the weak entry points).
+ break;
+ default:
+ // Anything else could modify the weak pointer.
+ goto clobbered;
+ }
+ }
+ clobbered:;
+ }
+
+ // Then, for each destroyWeak with an alloca operand, check to see if
+ // the alloca and all its users can be zapped.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::DestroyWeak)
+ continue;
+
+ CallInst *Call = cast<CallInst>(Inst);
+ Value *Arg = Call->getArgOperand(0);
+ if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+ for (User *U : Alloca->users()) {
+ const Instruction *UserInst = cast<Instruction>(U);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::DestroyWeak:
+ continue;
+ default:
+ goto done;
+ }
+ }
+ Changed = true;
+ for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
+ CallInst *UserInst = cast<CallInst>(*UI++);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ // These functions return their second argument.
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+ break;
+ case ARCInstKind::DestroyWeak:
+ // No return value.
+ break;
+ default:
+ llvm_unreachable("alloca really is used!");
+ }
+ UserInst->eraseFromParent();
+ }
+ Alloca->eraseFromParent();
+ done:;
+ }
+ }
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+ // Releases, Retains - These are used to store the results of the main flow
+ // analysis. These use Value* as the key instead of Instruction* so that the
+ // map stays valid when we get around to rewriting code and calls get
+ // replaced by arguments.
+ DenseMap<Value *, RRInfo> Releases;
+ BlotMapVector<Value *, RRInfo> Retains;
+
+ // This is used during the traversal of the function to track the
+ // states for each identified object at each block.
+ DenseMap<const BasicBlock *, BBState> BBStates;
+
+ // Analyze the CFG of the function, and all instructions.
+ bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+ // Transform.
+ bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
+ Releases,
+ F.getParent());
+
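+ // Request another round only when a complete pair was removed and nested
+ // retain+release pairs were seen, since removing the inner pair may expose
+ // the outer one.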
+ return AnyPairsCompletelyEliminated && NestingDetected;
+}
+
+/// Check if there is an earlier call on which the Retain depends, such that
+/// nothing in between the two can affect the reference count of their shared
+/// pointer argument. Note that Retain need not be in BB.
+static bool
+HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+ DepInsts, Visited, PA);
+ if (DepInsts.size() != 1)
+ return false;
+
+ auto *Call = dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+ // Check that the pointer is the return value of the call.
+ if (!Call || Arg != Call)
+ return false;
+
+ // Check that the call is a regular call.
+ ARCInstKind Class = GetBasicARCInstKind(Call);
+ return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call;
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+ Instruction *Autorelease,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &Visited,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(CanChangeRetainCount, Arg,
+ BB, Autorelease, DepInsts, Visited, PA);
+ if (DepInsts.size() != 1)
+ return nullptr;
+
+ auto *Retain = dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+ // Check that we found a retain with the same argument.
+ if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
+ GetArgRCIdentityRoot(Retain) != Arg) {
+ return nullptr;
+ }
+
+ return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+ ReturnInst *Ret,
+ SmallPtrSetImpl<Instruction *> &DepInsts,
+ SmallPtrSetImpl<const BasicBlock *> &V,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(NeedsPositiveRetainCount, Arg,
+ BB, Ret, DepInsts, V, PA);
+ if (DepInsts.size() != 1)
+ return nullptr;
+
+ auto *Autorelease = dyn_cast_or_null<CallInst>(*DepInsts.begin());
+ if (!Autorelease)
+ return nullptr;
+ ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
+ if (!IsAutorelease(AutoreleaseClass))
+ return nullptr;
+ if (GetArgRCIdentityRoot(Autorelease) != Arg)
+ return nullptr;
+
+ return Autorelease;
+}
+
+/// Look for this pattern:
+/// \code
+/// %call = call i8* @something(...)
+/// %2 = call i8* @objc_retain(i8* %call)
+/// %3 = call i8* @objc_autorelease(i8* %2)
+/// ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+ if (!F.getReturnType()->isPointerTy())
+ return;
+
+ DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+ for (BasicBlock &BB: F) {
+ ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
+ if (!Ret)
+ continue;
+
+ DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+
+ const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
+
+ // Look for an ``autorelease'' instruction that is a predecessor of Ret and
+ // dependent on Arg such that there are no instructions dependent on Arg
+ // that need a positive ref count in between the autorelease and Ret.
+ CallInst *Autorelease = FindPredecessorAutoreleaseWithSafePath(
+ Arg, &BB, Ret, DependingInstructions, Visited, PA);
+ DependingInstructions.clear();
+ Visited.clear();
+
+ if (!Autorelease)
+ continue;
+
+ CallInst *Retain = FindPredecessorRetainWithSafePath(
+ Arg, &BB, Autorelease, DependingInstructions, Visited, PA);
+ DependingInstructions.clear();
+ Visited.clear();
+
+ if (!Retain)
+ continue;
+
+ // Check that there is nothing that can affect the reference count
+ // between the retain and the call. Note that Retain need not be in BB.
+ bool HasSafePathToCall = HasSafePathToPredecessorCall(Arg, Retain,
+ DependingInstructions,
+ Visited, PA);
+ DependingInstructions.clear();
+ Visited.clear();
+
+ if (!HasSafePathToCall)
+ continue;
+
+ // If so, we can zap the retain and autorelease.
+ Changed = true;
+ ++NumRets;
+ DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: "
+ << *Autorelease << "\n");
+ EraseInstruction(Retain);
+ EraseInstruction(Autorelease);
+ }
+}
+
+#ifndef NDEBUG
+void
+ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
+ llvm::Statistic &NumRetains =
+ AfterOptimization? NumRetainsAfterOpt : NumRetainsBeforeOpt;
+ llvm::Statistic &NumReleases =
+ AfterOptimization? NumReleasesAfterOpt : NumReleasesBeforeOpt;
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ default:
+ break;
+ case ARCInstKind::Retain:
+ ++NumRetains;
+ break;
+ case ARCInstKind::Release:
+ ++NumReleases;
+ break;
+ }
+ }
+}
+#endif
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
+ return false;
+
+ // Intuitively, objc_retain and others are nocapture, however in practice
+ // they are not, because they return their argument value. And objc_release
+ // calls finalizers which can have arbitrary side effects.
+ MDKindCache.init(&M);
+
+ // Initialize our runtime entry point cache.
+ EP.init(&M);
+
+ return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+
+ DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>"
+ "\n");
+
+ PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, false);
+ }
+#endif
+
+ // This pass performs several distinct transformations. As a compile-time aid
+ // when compiling code that isn't ObjC, skip these if the relevant ObjC
+ // library functions aren't declared.
+
+ // Preliminary optimizations. This also computes UsedInThisFunction.
+ OptimizeIndividualCalls(F);
+
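+ // UsedInThisFunction is a bit mask with one bit per ARCInstKind, so the
+ // checks below cheaply test whether a whole family of entry points occurs in
+ // the function before running the corresponding optimization.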
+ // Optimizations for weak pointers.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
+ (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
+ (1 << unsigned(ARCInstKind::StoreWeak)) |
+ (1 << unsigned(ARCInstKind::InitWeak)) |
+ (1 << unsigned(ARCInstKind::CopyWeak)) |
+ (1 << unsigned(ARCInstKind::MoveWeak)) |
+ (1 << unsigned(ARCInstKind::DestroyWeak))))
+ OptimizeWeakCalls(F);
+
+ // Optimizations for retain+release pairs.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
+ (1 << unsigned(ARCInstKind::RetainRV)) |
+ (1 << unsigned(ARCInstKind::RetainBlock))))
+ if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
+ // Run OptimizeSequences until it either stops making changes or
+ // no retain+release pair nesting is detected.
+ while (OptimizeSequences(F)) {}
+
+ // Optimizations if objc_autorelease is used.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
+ (1 << unsigned(ARCInstKind::AutoreleaseRV))))
+ OptimizeReturns(F);
+
+ // Gather statistics after optimization.
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, true);
+ }
+#endif
+
+ DEBUG(dbgs() << "\n");
+
+ return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+ PA.clear();
+}
+
+/// @}
+///
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
new file mode 100644
index 000000000000..62fc52f6d091
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -0,0 +1,177 @@
+//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if, locally, two pointers can share a
+/// provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+ const Value *B) {
+ const DataLayout &DL = A->getModule()->getDataLayout();
+ // If the values are Selects with the same condition, we can do a more precise
+ // check: just check for relations between the values on corresponding arms.
+ if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+ if (A->getCondition() == SB->getCondition())
+ return related(A->getTrueValue(), SB->getTrueValue(), DL) ||
+ related(A->getFalseValue(), SB->getFalseValue(), DL);
+
+ // Check both arms of the Select node individually.
+ return related(A->getTrueValue(), B, DL) ||
+ related(A->getFalseValue(), B, DL);
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+ const Value *B) {
+ const DataLayout &DL = A->getModule()->getDataLayout();
+ // If the values are PHIs in the same block, we can do a more precise as well
+ // as efficient check: just check for relations between the values on
+ // corresponding edges.
+ if (const PHINode *PNB = dyn_cast<PHINode>(B))
+ if (PNB->getParent() == A->getParent()) {
+ for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+ if (related(A->getIncomingValue(i),
+ PNB->getIncomingValueForBlock(A->getIncomingBlock(i)), DL))
+ return true;
+ return false;
+ }
+
+ // Check each unique source of the PHI node against B.
+ SmallPtrSet<const Value *, 4> UniqueSrc;
+ for (Value *PV1 : A->incoming_values()) {
+ if (UniqueSrc.insert(PV1).second && related(PV1, B, DL))
+ return true;
+ }
+
+ // All of the arms checked out.
+ return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+ SmallPtrSet<const Value *, 8> Visited;
+ SmallVector<const Value *, 8> Worklist;
+ Worklist.push_back(P);
+ Visited.insert(P);
+ do {
+ P = Worklist.pop_back_val();
+ for (const Use &U : P->uses()) {
+ const User *Ur = U.getUser();
+ if (isa<StoreInst>(Ur)) {
+ if (U.getOperandNo() == 0)
+ // The pointer is stored.
+ return true;
+ // The pointer is stored through.
+ continue;
+ }
+ if (isa<CallInst>(Ur))
+ // The pointer is passed as an argument, ignore this.
+ continue;
+ if (isa<PtrToIntInst>(P))
+ // Assume the worst.
+ return true;
+ if (Visited.insert(Ur).second)
+ Worklist.push_back(Ur);
+ }
+ } while (!Worklist.empty());
+
+ // Everything checked out.
+ return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
+ const DataLayout &DL) {
+ // Skip past provenance pass-throughs.
+ A = GetUnderlyingObjCPtr(A, DL);
+ B = GetUnderlyingObjCPtr(B, DL);
+
+ // Quick check.
+ if (A == B)
+ return true;
+
+ // Ask regular AliasAnalysis, for a first approximation.
+ switch (AA->alias(A, B)) {
+ case NoAlias:
+ return false;
+ case MustAlias:
+ case PartialAlias:
+ return true;
+ case MayAlias:
+ break;
+ }
+
+ bool AIsIdentified = IsObjCIdentifiedObject(A);
+ bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+ // An ObjC-Identified object can't alias a load if it is never locally stored.
+ if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return IsStoredObjCPointer(A);
+ if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
+ }
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ }
+
+ // Special handling for PHI and Select.
+ if (const PHINode *PN = dyn_cast<PHINode>(A))
+ return relatedPHI(PN, B);
+ if (const PHINode *PN = dyn_cast<PHINode>(B))
+ return relatedPHI(PN, A);
+ if (const SelectInst *S = dyn_cast<SelectInst>(A))
+ return relatedSelect(S, B);
+ if (const SelectInst *S = dyn_cast<SelectInst>(B))
+ return relatedSelect(S, A);
+
+ // Conservative.
+ return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A, const Value *B,
+ const DataLayout &DL) {
+ // Begin by inserting a conservative value into the map. If the insertion
+ // fails, we have the answer already. If it succeeds, leave it there until we
+ // compute the real answer to guard against recursive queries.
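+ // Canonicalize the operand order so that (A, B) and (B, A) map to a single
+ // cache entry.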
+ if (A > B) std::swap(A, B);
+ std::pair<CachedResultsTy::iterator, bool> Pair =
+ CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+ if (!Pair.second)
+ return Pair.first->second;
+
+ bool Result = relatedCheck(A, B, DL);
+ CachedResults[ValuePairTy(A, B)] = Result;
+ return Result;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
new file mode 100644
index 000000000000..1a12b659e5a3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -0,0 +1,81 @@
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if, locally, two pointers can share a
+/// provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+
+namespace llvm {
+ class Value;
+ class DataLayout;
+ class PHINode;
+ class SelectInst;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same
+/// techniques, except it uses special ObjC-specific reasoning about pointer
+/// relationships.
+///
+/// In this context ``Provenance'' is defined as the history of an object's
+/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
+/// an ``independent provenance source'' of a pointer to determine whether or
+/// not two pointers have the same provenance source and thus could
+/// potentially be related.
+class ProvenanceAnalysis {
+ AliasAnalysis *AA;
+
+ typedef std::pair<const Value *, const Value *> ValuePairTy;
+ typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
+ CachedResultsTy CachedResults;
+
+ bool relatedCheck(const Value *A, const Value *B, const DataLayout &DL);
+ bool relatedSelect(const SelectInst *A, const Value *B);
+ bool relatedPHI(const PHINode *A, const Value *B);
+
+ void operator=(const ProvenanceAnalysis &) = delete;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
+
+public:
+ ProvenanceAnalysis() {}
+
+ void setAA(AliasAnalysis *aa) { AA = aa; }
+
+ AliasAnalysis *getAA() const { return AA; }
+
+ bool related(const Value *A, const Value *B, const DataLayout &DL);
+
+ void clear() {
+ CachedResults.clear();
+ }
+};
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
new file mode 100644
index 000000000000..870a5f600fd8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -0,0 +1,94 @@
+//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+class PAEval : public FunctionPass {
+
+public:
+ static char ID;
+ PAEval();
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char PAEval::ID = 0;
+PAEval::PAEval() : FunctionPass(ID) {}
+
+void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+}
+
+static StringRef getName(Value *V) {
+ StringRef Name = V->getName();
+ if (Name.startswith("\1"))
+ return Name.substr(1);
+ return Name;
+}
+
+static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
+ if (!V->hasName())
+ return;
+ Values.insert(V);
+}
+
+bool PAEval::runOnFunction(Function &F) {
+ SetVector<Value *> Values;
+
+ for (auto &Arg : F.args())
+ insertIfNamed(Values, &Arg);
+
+ for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ insertIfNamed(Values, &*I);
+
+ for (auto &Op : I->operands())
+ insertIfNamed(Values, Op);
+ }
+
+ ProvenanceAnalysis PA;
+ PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ for (Value *V1 : Values) {
+ StringRef NameV1 = getName(V1);
+ for (Value *V2 : Values) {
+ StringRef NameV2 = getName(V2);
+ if (NameV1 >= NameV2)
+ continue;
+ errs() << NameV1 << " and " << NameV2;
+ if (PA.related(V1, V2, DL))
+ errs() << " are related.\n";
+ else
+ errs() << " are not related.\n";
+ }
+ }
+
+ return false;
+}
+
+FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
+
+INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
new file mode 100644
index 000000000000..d13e941044f1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
@@ -0,0 +1,410 @@
+//===--- PtrState.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PtrState.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ptr-state"
+
+//===----------------------------------------------------------------------===//
+// Utility
+//===----------------------------------------------------------------------===//
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
+ switch (S) {
+ case S_None:
+ return OS << "S_None";
+ case S_Retain:
+ return OS << "S_Retain";
+ case S_CanRelease:
+ return OS << "S_CanRelease";
+ case S_Use:
+ return OS << "S_Use";
+ case S_Release:
+ return OS << "S_Release";
+ case S_MovableRelease:
+ return OS << "S_MovableRelease";
+ case S_Stop:
+ return OS << "S_Stop";
+ }
+ llvm_unreachable("Unknown sequence type.");
+}
+
+//===----------------------------------------------------------------------===//
+// Sequence
+//===----------------------------------------------------------------------===//
+
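+// Merge two sequence states coming from different paths. For example, merging
+// S_Retain with S_Use during a top-down merge yields S_Use, the state further
+// along the sequence; combinations the lattice cannot reconcile collapse to
+// S_None.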
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+ // The easy cases.
+ if (A == B)
+ return A;
+ if (A == S_None || B == S_None)
+ return S_None;
+
+ if (A > B)
+ std::swap(A, B);
+ if (TopDown) {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
+ return B;
+ } else {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Use || A == S_CanRelease) &&
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+ return A;
+ // If both sides are releases, choose the more conservative one.
+ if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+ return A;
+ if (A == S_Release && B == S_MovableRelease)
+ return A;
+ }
+
+ return S_None;
+}
+
+//===----------------------------------------------------------------------===//
+// RRInfo
+//===----------------------------------------------------------------------===//
+
+void RRInfo::clear() {
+ KnownSafe = false;
+ IsTailCallRelease = false;
+ ReleaseMetadata = nullptr;
+ Calls.clear();
+ ReverseInsertPts.clear();
+ CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+ // Conservatively merge the ReleaseMetadata information.
+ if (ReleaseMetadata != Other.ReleaseMetadata)
+ ReleaseMetadata = nullptr;
+
+ // Conservatively merge the boolean state.
+ KnownSafe &= Other.KnownSafe;
+ IsTailCallRelease &= Other.IsTailCallRelease;
+ CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+ // Merge the call sets.
+ Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+ for (Instruction *Inst : Other.ReverseInsertPts)
+ Partial |= ReverseInsertPts.insert(Inst).second;
+ return Partial;
+}
+
+//===----------------------------------------------------------------------===//
+// PtrState
+//===----------------------------------------------------------------------===//
+
+void PtrState::SetKnownPositiveRefCount() {
+ DEBUG(dbgs() << " Setting Known Positive.\n");
+ KnownPositiveRefCount = true;
+}
+
+void PtrState::ClearKnownPositiveRefCount() {
+ DEBUG(dbgs() << " Clearing Known Positive.\n");
+ KnownPositiveRefCount = false;
+}
+
+void PtrState::SetSeq(Sequence NewSeq) {
+ DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq << "\n");
+ Seq = NewSeq;
+}
+
+void PtrState::ResetSequenceProgress(Sequence NewSeq) {
+ DEBUG(dbgs() << " Resetting sequence progress.\n");
+ SetSeq(NewSeq);
+ Partial = false;
+ RRI.clear();
+}
+
+void PtrState::Merge(const PtrState &Other, bool TopDown) {
+ Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
+ KnownPositiveRefCount &= Other.KnownPositiveRefCount;
+
+ // If we're not in a sequence (anymore), drop all associated state.
+ if (Seq == S_None) {
+ Partial = false;
+ RRI.clear();
+ } else if (Partial || Other.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+ // RR elimination. If the branch predicates for the two merge differ,
+ // mixing them is unsafe.
+ ClearSequenceProgress();
+ } else {
+ // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
+ // point, we know that currently we are not partial. Stash whether or not
+ // the merge operation caused us to undergo a partial merging of reverse
+ // insertion points.
+ Partial = RRI.Merge(Other.RRI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BottomUpPtrState
+//===----------------------------------------------------------------------===//
+
+bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
+ // If we see two releases in a row on the same pointer, make a note, and
+ // we'll circle back to revisit it after we've
+ // hopefully eliminated the second release, which may allow us to
+ // eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ bool NestingDetected = false;
+ if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
+ DEBUG(dbgs() << " Found nested releases (i.e. a release pair)\n");
+ NestingDetected = true;
+ }
+
+ MDNode *ReleaseMetadata =
+ I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+ Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+ ResetSequenceProgress(NewSeq);
+ SetReleaseMetadata(ReleaseMetadata);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ SetTailCallRelease(cast<CallInst>(I)->isTailCall());
+ InsertCall(I);
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool BottomUpPtrState::MatchWithRetain() {
+ SetKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+ switch (OldSeq) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
+ // imprecise release, clear our reverse insertion points.
+ if (OldSeq != S_Use || IsTrackingImpreciseReleases())
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_CanRelease:
+ return true;
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ Sequence S = GetSeq();
+
+ // Check for possible releases.
+ if (!CanAlterRefCount(Inst, Ptr, PA, Class))
+ return false;
+
+ DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; " << *Ptr
+ << "\n");
+ switch (S) {
+ case S_Use:
+ SetSeq(S_CanRelease);
+ return true;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
+ assert(!HasReverseInsertPts());
+ SetSeq(NewSeq);
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ if (isa<InvokeInst>(Inst))
+ InsertReverseInsertPt(&*BB->getFirstInsertionPt());
+ else
+ InsertReverseInsertPt(&*++Inst->getIterator());
+ };
+
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_Release:
+ case S_MovableRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Use);
+ } else if (Seq == S_Release && IsUser(Class)) {
+ DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ // Non-movable releases depend on any possible objc pointer use.
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
+ if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
+ DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ }
+ }
+ break;
+ case S_Stop:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeq(S_Use);
+ }
+ break;
+ case S_CanRelease:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// TopDownPtrState
+//===----------------------------------------------------------------------===//
+
+bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
+ bool NestingDetected = false;
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Kind != ARCInstKind::RetainRV) {
+ // If we see two retains in a row on the same pointer, make a note, and
+ // we'll circle back to revisit it after we've
+ // hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ ResetSequenceProgress(S_Retain);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ InsertCall(I);
+ }
+
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
+ Instruction *Release) {
+ ClearKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+
+ MDNode *ReleaseMetadata =
+ Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+
+ switch (OldSeq) {
+ case S_Retain:
+ case S_CanRelease:
+ if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_Use:
+ SetReleaseMetadata(ReleaseMetadata);
+ SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
+ return true;
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in bottom up state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible releases. Treat clang.arc.use as a releasing instruction
+ // to prevent sinking a retain past it.
+ if (!CanAlterRefCount(Inst, Ptr, PA, Class) &&
+ Class != ARCInstKind::IntrinsicUser)
+ return false;
+
+ DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ ClearKnownPositiveRefCount();
+ switch (GetSeq()) {
+ case S_Retain:
+ SetSeq(S_CanRelease);
+ assert(!HasReverseInsertPts());
+ InsertReverseInsertPt(Inst);
+
+ // One call can't cause a transition from S_Retain to S_CanRelease
+ // and S_CanRelease to S_Use. If we've made the first transition,
+ // we're done.
+ return true;
+ case S_Use:
+ case S_CanRelease:
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ llvm_unreachable("covered switch is not covered!?");
+}
+
+void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_CanRelease:
+ if (!CanUse(Inst, Ptr, PA, Class))
+ return;
+ DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ SetSeq(S_Use);
+ return;
+ case S_Retain:
+ case S_Use:
+ case S_None:
+ return;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
new file mode 100644
index 000000000000..87298fa59bfd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
@@ -0,0 +1,210 @@
+//===--- PtrState.h - ARC State for a Ptr -------------------*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations for the ARC state associated with a ptr. It
+// is only used by the ARC Sequence Dataflow computation. By separating this
+// from the actual dataflow, it is easier to consider the mechanics of the ARC
+// optimization separate from the actual predicates being used.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace objcarc {
+
+class ARCMDKindCache;
+class ProvenanceAnalysis;
+
+/// \enum Sequence
+///
+/// \brief A sequence of states that a pointer may go through in which an
+/// objc_retain and objc_release are actually needed.
+enum Sequence {
+ S_None,
+ S_Retain, ///< objc_retain(x).
+ S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
+ S_Use, ///< any use of x.
+ S_Stop, ///< like S_Release, but code motion is stopped.
+ S_Release, ///< objc_release(x).
+ S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+};
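+// A top-down walk typically advances a pointer through S_Retain ->
+// S_CanRelease -> S_Use before the matching release is found, while the
+// bottom-up walk starts at S_Release or S_MovableRelease and moves toward the
+// matching retain.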
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const Sequence S) LLVM_ATTRIBUTE_UNUSED;
+
+/// \brief Unidirectional information about either a
+/// retain-decrement-use-release sequence or release-use-decrement-retain
+/// reverse sequence.
+struct RRInfo {
+ /// After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+ /// pair all on the known same pointer value can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe;
+
+ /// True if the objc_release calls are all marked with the "tail" keyword.
+ bool IsTailCallRelease;
+
+ /// If the Calls are objc_release calls and they all have a
+ /// clang.imprecise_release tag, this is the metadata tag.
+ MDNode *ReleaseMetadata;
+
+ /// For a top-down sequence, the set of objc_retains or
+ /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+ SmallPtrSet<Instruction *, 2> Calls;
+
+ /// The set of optimal insert positions for moving calls in the opposite
+ /// sequence.
+ SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+ /// If this is true, we cannot perform code motion but can still remove
+ /// retain/release pairs.
+ bool CFGHazardAfflicted;
+
+ RRInfo()
+ : KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr),
+ CFGHazardAfflicted(false) {}
+
+ void clear();
+
+ /// Conservatively merge the two RRInfo. Returns true if a partial merge has
+ /// occurred, false otherwise.
+ bool Merge(const RRInfo &Other);
+};
+
+/// \brief This class summarizes several per-pointer runtime properties which
+/// are propagated through the flow graph.
+class PtrState {
+protected:
+ /// True if the reference count is known to be incremented.
+ bool KnownPositiveRefCount;
+
+ /// True if we've seen an opportunity for partial RR elimination, such as
+ /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+ bool Partial;
+
+ /// The current position in the sequence.
+ unsigned char Seq : 8;
+
+ /// Unidirectional information about the current sequence.
+ RRInfo RRI;
+
+ PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {}
+
+public:
+ bool IsKnownSafe() const { return RRI.KnownSafe; }
+
+ void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
+
+ bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
+
+ void SetTailCallRelease(const bool NewValue) {
+ RRI.IsTailCallRelease = NewValue;
+ }
+
+ bool IsTrackingImpreciseReleases() const {
+ return RRI.ReleaseMetadata != nullptr;
+ }
+
+ const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
+
+ void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
+
+ bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
+
+ void SetCFGHazardAfflicted(const bool NewValue) {
+ RRI.CFGHazardAfflicted = NewValue;
+ }
+
+ void SetKnownPositiveRefCount();
+ void ClearKnownPositiveRefCount();
+
+ bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
+
+ void SetSeq(Sequence NewSeq);
+
+ Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
+
+ void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
+
+ void ResetSequenceProgress(Sequence NewSeq);
+ void Merge(const PtrState &Other, bool TopDown);
+
+ void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
+
+ void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
+
+ void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
+
+ bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
+
+ const RRInfo &GetRRInfo() const { return RRI; }
+};
+
+struct BottomUpPtrState : PtrState {
+ BottomUpPtrState() : PtrState() {}
+
+  /// (Re-)Initialize this bottom-up pointer, returning true if we detected a
+  /// pointer with nested releases.
+ bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
+
+  /// Return true if this set of releases can be paired with a retain. Modifies
+ /// state appropriately to reflect that the matching occurred if it is
+ /// successful.
+ ///
+ /// It is assumed that one has already checked that the RCIdentity of the
+ /// retain and the RCIdentity of this ptr state are the same.
+ bool MatchWithRetain();
+
+ void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+struct TopDownPtrState : PtrState {
+ TopDownPtrState() : PtrState() {}
+
+  /// (Re-)Initialize this top-down pointer, returning true if we detected a
+  /// pointer with nested retains.
+ bool InitTopDown(ARCInstKind Kind, Instruction *I);
+
+ /// Return true if this set of retains can be paired with the given
+ /// release. Modifies state appropriately to reflect that the matching
+ /// occurred.
+ bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
+
+ void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 000000000000..5b467dc9fe12
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,664 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
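+// For example, in
+//
+//   int f(int n, const int *a) {
+//     int s = 0;
+//     for (int i = 0; i < n; ++i)
+//       s += a[i];            // feeds only the dead value 's'
+//     return n;
+//   }
+//
+// the loop-carried phi for 's' keeps itself alive under a purely use-count
+// based DCE, whereas the optimistic formulation here can delete the whole
+// computation (and, when control flow removal is enabled, the branch
+// structure as well). This is an illustrative sketch rather than a complete
+// description of the pass.
+//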
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
+
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
+static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
+ cl::init(true), cl::Hidden);
+
+// This option enables removing of may-be-infinite loops which have no other
+// effect.
+static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
+ cl::Hidden);
+
+namespace {
+/// Information about Instructions
+struct InstInfoType {
+ /// True if the associated instruction is live.
+ bool Live = false;
+ /// Quick access to information for block containing associated Instruction.
+ struct BlockInfoType *Block = nullptr;
+};
+
+/// Information about basic blocks relevant to dead code elimination.
+struct BlockInfoType {
+  /// True when this block contains a live instruction.
+ bool Live = false;
+ /// True when this block ends in an unconditional branch.
+ bool UnconditionalBranch = false;
+ /// True when this block is known to have live PHI nodes.
+ bool HasLivePhiNodes = false;
+ /// Control dependence sources need to be live for this block.
+ bool CFLive = false;
+
+ /// Quick access to the LiveInfo for the terminator,
+ /// holds the value &InstInfo[Terminator]
+ InstInfoType *TerminatorLiveInfo = nullptr;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
+
+ /// Corresponding BasicBlock.
+ BasicBlock *BB = nullptr;
+
+ /// Cache of BB->getTerminator().
+ TerminatorInst *Terminator = nullptr;
+
+ /// Post-order numbering of reverse control flow graph.
+ unsigned PostOrder;
+};
+
+class AggressiveDeadCodeElimination {
+ Function &F;
+ PostDominatorTree &PDT;
+
+  /// Mapping of basic blocks to associated information.
+ DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+ bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
+
+ /// Mapping of instructions to associated information.
+ DenseMap<Instruction *, InstInfoType> InstInfo;
+ bool isLive(Instruction *I) { return InstInfo[I].Live; }
+
+ /// Instructions known to be live where we need to mark
+ /// reaching definitions as live.
+ SmallVector<Instruction *, 128> Worklist;
+ /// Debug info scopes around a live instruction.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
+
+  /// Set of blocks not known to have live terminators.
+ SmallPtrSet<BasicBlock *, 16> BlocksWithDeadTerminators;
+
+  /// The set of blocks for which we have determined that the control
+  /// dependence sources must be live and whose dependences have not yet
+  /// been analyzed.
+ SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
+
+ /// Set up auxiliary data structures for Instructions and BasicBlocks and
+  /// initialize the Worklist to the set of must-be-live Instructions.
+ void initialize();
+ /// Return true for operations which are always treated as live.
+ bool isAlwaysLive(Instruction &I);
+  /// Return true for value profiling instrumentation calls on constants.
+ bool isInstrumentsConstant(Instruction &I);
+
+ /// Propagate liveness to reaching definitions.
+ void markLiveInstructions();
+ /// Mark an instruction as live.
+ void markLive(Instruction *I);
+ /// Mark a block as live.
+ void markLive(BlockInfoType &BB);
+ void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
+
+ /// Mark terminators of control predecessors of a PHI node live.
+ void markPhiLive(PHINode *PN);
+
+ /// Record the Debug Scopes which surround live debug information.
+ void collectLiveScopes(const DILocalScope &LS);
+ void collectLiveScopes(const DILocation &DL);
+
+ /// Analyze dead branches to find those whose branches are the sources
+ /// of control dependences impacting a live block. Those branches are
+ /// marked live.
+ void markLiveBranchesFromControlDependences();
+
+  /// Remove instructions not marked live; return true if any instruction
+  /// was removed.
+ bool removeDeadInstructions();
+
+ /// Identify connected sections of the control flow graph which have
+ /// dead terminators and rewrite the control flow graph to remove them.
+ void updateDeadRegions();
+
+ /// Set the BlockInfo::PostOrder field based on a post-order
+ /// numbering of the reverse control flow graph.
+ void computeReversePostOrder();
+
+ /// Make the terminator of this block an unconditional branch to \p Target.
+ void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
+
+public:
+ AggressiveDeadCodeElimination(Function &F, PostDominatorTree &PDT)
+ : F(F), PDT(PDT) {}
+ bool performDeadCodeElimination();
+};
+}
+
+bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
+ initialize();
+ markLiveInstructions();
+ return removeDeadInstructions();
+}
+
+static bool isUnconditionalBranch(TerminatorInst *Term) {
+ auto *BR = dyn_cast<BranchInst>(Term);
+ return BR && BR->isUnconditional();
+}
+
+void AggressiveDeadCodeElimination::initialize() {
+
+ auto NumBlocks = F.size();
+
+ // We will have an entry in the map for each block so we grow the
+ // structure to twice that size to keep the load factor low in the hash table.
+ BlockInfo.reserve(NumBlocks);
+ size_t NumInsts = 0;
+
+  // Iterate over blocks and initialize BlockInfo entries, count
+ // instructions to size the InstInfo hash table.
+ for (auto &BB : F) {
+ NumInsts += BB.size();
+ auto &Info = BlockInfo[&BB];
+ Info.BB = &BB;
+ Info.Terminator = BB.getTerminator();
+ Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ }
+
+ // Initialize instruction map and set pointers to block info.
+ InstInfo.reserve(NumInsts);
+ for (auto &BBInfo : BlockInfo)
+ for (Instruction &I : *BBInfo.second.BB)
+ InstInfo[&I].Block = &BBInfo.second;
+
+  // Since BlockInfo holds pointers into InstInfo and vice versa, we may not
+ // add any more elements to either after this point.
+ for (auto &BBInfo : BlockInfo)
+ BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : instructions(F))
+ if (isAlwaysLive(I))
+ markLive(&I);
+
+ if (!RemoveControlFlowFlag)
+ return;
+
+ if (!RemoveLoops) {
+ // This stores state for the depth-first iterator. In addition
+ // to recording which nodes have been visited we also record whether
+ // a node is currently on the "stack" of active ancestors of the current
+ // node.
+    typedef DenseMap<BasicBlock *, bool> StatusMap;
+ class DFState : public StatusMap {
+ public:
+ std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
+ return StatusMap::insert(std::make_pair(BB, true));
+ }
+
+ // Invoked after we have visited all children of a node.
+ void completed(BasicBlock *BB) { (*this)[BB] = false; }
+
+ // Return true if \p BB is currently on the active stack
+ // of ancestors.
+ bool onStack(BasicBlock *BB) {
+ auto Iter = find(BB);
+ return Iter != end() && Iter->second;
+ }
+ } State;
+
+ State.reserve(F.size());
+    // Iterate over blocks in depth-first pre-order and
+    // treat all edges to a block already on the active stack as loop back
+    // edges, marking the branch live if there is such a back edge.
+ for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (isLive(Term))
+ continue;
+
+ for (auto *Succ : successors(BB))
+ if (State.onStack(Succ)) {
+ // back edge....
+ markLive(Term);
+ break;
+ }
+ }
+ }
+
+  // Mark the terminator live for any block from which there is no path to
+  // the return of the function, or which has a successor for which this is
+  // true. This protects IDFCalculator, which cannot handle such blocks.
+ for (auto &BBInfoPair : BlockInfo) {
+ auto &BBInfo = BBInfoPair.second;
+ if (BBInfo.terminatorIsLive())
+ continue;
+ auto *BB = BBInfo.BB;
+ if (!PDT.getNode(BB)) {
+ DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
+ << '\n';);
+ markLive(BBInfo.Terminator);
+ continue;
+ }
+ for (auto *Succ : successors(BB))
+ if (!PDT.getNode(Succ)) {
+ DEBUG(dbgs() << "Successor not post-dominated by return: "
+ << BB->getName() << '\n';);
+ markLive(BBInfo.Terminator);
+ break;
+ }
+ }
+
+ // Treat the entry block as always live
+ auto *BB = &F.getEntryBlock();
+ auto &EntryInfo = BlockInfo[BB];
+ EntryInfo.Live = true;
+ if (EntryInfo.UnconditionalBranch)
+ markLive(EntryInfo.Terminator);
+
+ // Build initial collection of blocks with dead terminators
+ for (auto &BBInfo : BlockInfo)
+ if (!BBInfo.second.terminatorIsLive())
+ BlocksWithDeadTerminators.insert(BBInfo.second.BB);
+}
+
+bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
+ // TODO -- use llvm::isInstructionTriviallyDead
+ if (I.isEHPad() || I.mayHaveSideEffects()) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ return false;
+ return true;
+ }
+ if (!isa<TerminatorInst>(I))
+ return false;
+ if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ return false;
+ return true;
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
+ // TODO -- move this test into llvm::isInstructionTriviallyDead
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
+void AggressiveDeadCodeElimination::markLiveInstructions() {
+
+ // Propagate liveness backwards to operands.
+ do {
+ // Worklist holds newly discovered live instructions
+ // where we need to mark the inputs as live.
+ while (!Worklist.empty()) {
+ Instruction *LiveInst = Worklist.pop_back_val();
+ DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+
+ for (Use &OI : LiveInst->operands())
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ markLive(Inst);
+
+ if (auto *PN = dyn_cast<PHINode>(LiveInst))
+ markPhiLive(PN);
+ }
+
+    // After data flow liveness has been identified, examine which branch
+    // decisions are required to ensure that live instructions are executed.
+ markLiveBranchesFromControlDependences();
+
+ } while (!Worklist.empty());
+}
+
+void AggressiveDeadCodeElimination::markLive(Instruction *I) {
+
+ auto &Info = InstInfo[I];
+ if (Info.Live)
+ return;
+
+ DEBUG(dbgs() << "mark live: "; I->dump());
+ Info.Live = true;
+ Worklist.push_back(I);
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = I->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Mark the containing block live
+ auto &BBInfo = *Info.Block;
+ if (BBInfo.Terminator == I) {
+ BlocksWithDeadTerminators.erase(BBInfo.BB);
+    // For live terminators, mark destination blocks
+    // live to preserve these control flow edges.
+ if (!BBInfo.UnconditionalBranch)
+ for (auto *BB : successors(I->getParent()))
+ markLive(BB);
+ }
+ markLive(BBInfo);
+}
+
+void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
+ if (BBInfo.Live)
+ return;
+ DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ BBInfo.Live = true;
+ if (!BBInfo.CFLive) {
+ BBInfo.CFLive = true;
+ NewLiveBlocks.insert(BBInfo.BB);
+ }
+
+ // Mark unconditional branches at the end of live
+ // blocks as live since there is no work to do for them later
+ if (BBInfo.UnconditionalBranch)
+ markLive(BBInfo.Terminator);
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
+ if (!AliveScopes.insert(&LS).second)
+ return;
+
+ if (isa<DISubprogram>(LS))
+ return;
+
+ // Tail-recurse through the scope chain.
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
+ // Even though DILocations are not scopes, shove them into AliveScopes so we
+ // don't revisit them.
+ if (!AliveScopes.insert(&DL).second)
+ return;
+
+ // Collect live scopes from the scope chain.
+ collectLiveScopes(*DL.getScope());
+
+ // Tail-recurse through the inlined-at chain.
+ if (const DILocation *IA = DL.getInlinedAt())
+ collectLiveScopes(*IA);
+}
+
+void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
+ auto &Info = BlockInfo[PN->getParent()];
+ // Only need to check this once per block.
+ if (Info.HasLivePhiNodes)
+ return;
+ Info.HasLivePhiNodes = true;
+
+  // If a predecessor block is not yet control-flow live, mark it as such,
+  // which will trigger marking live the branches upon which
+  // that block is control dependent.
+ for (auto *PredBB : predecessors(Info.BB)) {
+ auto &Info = BlockInfo[PredBB];
+ if (!Info.CFLive) {
+ Info.CFLive = true;
+ NewLiveBlocks.insert(PredBB);
+ }
+ }
+}
+
+void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
+
+ if (BlocksWithDeadTerminators.empty())
+ return;
+
+ DEBUG({
+ dbgs() << "new live blocks:\n";
+ for (auto *BB : NewLiveBlocks)
+ dbgs() << "\t" << BB->getName() << '\n';
+ dbgs() << "dead terminator blocks:\n";
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << "\t" << BB->getName() << '\n';
+ });
+
+ // The dominance frontier of a live block X in the reverse
+ // control graph is the set of blocks upon which X is control
+ // dependent. The following sequence computes the set of blocks
+ // which currently have dead terminators that are control
+ // dependence sources of a block which is in NewLiveBlocks.
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ ReverseIDFCalculator IDFs(PDT);
+ IDFs.setDefiningBlocks(NewLiveBlocks);
+ IDFs.setLiveInBlocks(BlocksWithDeadTerminators);
+ IDFs.calculate(IDFBlocks);
+ NewLiveBlocks.clear();
+
+ // Dead terminators which control live blocks are now marked live.
+ for (auto *BB : IDFBlocks) {
+ DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ markLive(BB->getTerminator());
+ }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Routines to update the CFG and SSA information before removing dead code.
+//
+//===----------------------------------------------------------------------===//
+bool AggressiveDeadCodeElimination::removeDeadInstructions() {
+
+ // Updates control and dataflow around dead blocks
+ updateDeadRegions();
+
+ DEBUG({
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+      // If the intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (isLive(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ }
+ }
+ });
+
+ // The inverse of the live set is the dead set. These are those instructions
+ // that have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // Fallthrough and drop the intrinsic.
+ }
+
+ // Prepare to delete.
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return !Worklist.empty();
+}
+
+// A dead region is the set of dead blocks with a common live post-dominator.
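+// For example, when a block's conditional branch remains dead, it is
+// rewritten into an unconditional branch to the successor closest to the
+// end of the function (the successor with the highest reverse-CFG
+// post-order number computed below), and the edges to the other successors
+// are dropped; this is an informal summary of the rewrite performed here.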
+void AggressiveDeadCodeElimination::updateDeadRegions() {
+
+ DEBUG({
+ dbgs() << "final dead terminator blocks: " << '\n';
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << '\t' << BB->getName()
+ << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
+ });
+
+  // Don't compute the post ordering unless we need it.
+ bool HavePostOrder = false;
+
+ for (auto *BB : BlocksWithDeadTerminators) {
+ auto &Info = BlockInfo[BB];
+ if (Info.UnconditionalBranch) {
+ InstInfo[Info.Terminator].Live = true;
+ continue;
+ }
+
+ if (!HavePostOrder) {
+ computeReversePostOrder();
+ HavePostOrder = true;
+ }
+
+    // Add an unconditional branch to the successor closest to the
+    // end of the function, which ensures a path to the exit for each
+    // live edge.
+ BlockInfoType *PreferredSucc = nullptr;
+ for (auto *Succ : successors(BB)) {
+ auto *Info = &BlockInfo[Succ];
+ if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
+ PreferredSucc = Info;
+ }
+ assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
+ "Failed to find safe successor for dead branch");
+ bool First = true;
+ for (auto *Succ : successors(BB)) {
+ if (!First || Succ != PreferredSucc->BB)
+ Succ->removePredecessor(BB);
+ else
+ First = false;
+ }
+ makeUnconditional(BB, PreferredSucc->BB);
+ NumBranchesRemoved += 1;
+ }
+}
+
+// Compute a post-order numbering of the reverse control flow graph.
+void AggressiveDeadCodeElimination::computeReversePostOrder() {
+
+  // This provides a post-order numbering of the reverse control flow graph.
+  // Note that it is incomplete in the presence of infinite loops, but we
+  // don't need to number blocks which don't reach the end of the function,
+  // since all branches in those blocks are forced live.
+
+ // For each block without successors, extend the DFS from the block
+ // backward through the graph
+ SmallPtrSet<BasicBlock*, 16> Visited;
+ unsigned PostOrder = 0;
+ for (auto &BB : F) {
+ if (succ_begin(&BB) != succ_end(&BB))
+ continue;
+ for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
+ BlockInfo[Block].PostOrder = PostOrder++;
+ }
+}
+
+void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
+ BasicBlock *Target) {
+ TerminatorInst *PredTerm = BB->getTerminator();
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ collectLiveScopes(*DL);
+
+  // An existing unconditional branch is simply retargeted and marked live.
+ if (isUnconditionalBranch(PredTerm)) {
+ PredTerm->setSuccessor(0, Target);
+ InstInfo[PredTerm].Live = true;
+ return;
+ }
+ DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ NumBranchesRemoved += 1;
+ IRBuilder<> Builder(PredTerm);
+ auto *NewTerm = Builder.CreateBr(Target);
+ InstInfo[NewTerm].Live = true;
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ NewTerm->setDebugLoc(DL);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Pass Manager integration code
+//
+//===----------------------------------------------------------------------===//
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
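+
+// A minimal usage sketch for the new pass manager (assuming the usual
+// PassBuilder-driven setup; this is illustrative and not part of this
+// file's logic). Given a Function &F:
+//
+//   FunctionAnalysisManager FAM;
+//   PassBuilder PB;
+//   PB.registerFunctionAnalyses(FAM);
+//   FunctionPassManager FPM;
+//   FPM.addPass(ADCEPass());
+//   FPM.run(F, FAM);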
+
+namespace {
+struct ADCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ADCELegacyPass() : FunctionPass(ID) {
+ initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ if (!RemoveControlFlowFlag)
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char ADCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
+ "Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 000000000000..99480f12da9e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -0,0 +1,450 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
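+// The kind of source-level assumption this pass consumes looks like the
+// following sketch (clang lowers __builtin_assume into llvm.assume, which
+// this pass then inspects):
+//
+//   void f(float *a, int n) {
+//     __builtin_assume(((uintptr_t)a & 31) == 0);   // a is 32-byte aligned
+//     for (int i = 0; i < n; ++i)
+//       a[i] *= 2.0f;
+//   }
+//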
+//===----------------------------------------------------------------------===//
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ AlignmentFromAssumptionsPass Impl;
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = Diff % int64_t(Alignment)
+ const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+ const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+ const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+ DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+ *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value.
+ if (!DiffUnits)
+ return (unsigned)
+ cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return this remainder (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return (unsigned) DiffUnitsAbs;
+ }
+
+ return 0;
+}
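+
+// A small, self-contained illustration of the arithmetic above; the helper
+// is marked unused (it is not called by the pass) and its name is
+// illustrative. For a displacement following the recurrence {16,+,32}
+// checked against a 32-byte alignment assumption, every value is congruent
+// to 16 (mod 32), so the displaced pointer may be treated as 16-byte aligned.
+LLVM_ATTRIBUTE_UNUSED
+static unsigned exampleNewAlignmentDiff(int64_t Diff, int64_t Align) {
+  int64_t DiffUnits = Diff % Align;       // e.g. (16 + 32*i) % 32 == 16
+  if (DiffUnits == 0)
+    return (unsigned)Align;               // exact multiple: keep the alignment
+  uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+  return isPowerOf2_64(DiffUnitsAbs) ? (unsigned)DiffUnitsAbs : 0;
+}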
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+ *AlignSCEV << " and offset " << *OffSCEV <<
+ " using diff " << *DiffSCEV << "\n");
+
+ unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+ DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+ if (NewAlignment) {
+ return NewAlignment;
+ } else if (const SCEVAddRecExpr *DiffARSCEV =
+ dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+    // The relative offset to the alignment assumption did not yield a constant,
+    // but we should try harder: if we assume that a is 32-byte aligned, then in
+    // the loop for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads
+    // from a are 32-byte aligned; they instead alternate between 32-byte and
+    // 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+ *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one.
+ NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+ if (!NewAlignment || !NewIncAlignment) {
+ return 0;
+ } else if (NewAlignment > NewIncAlignment) {
+ if (NewAlignment % NewIncAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewIncAlignment << "\n");
+ return NewIncAlignment;
+ }
+ } else if (NewIncAlignment > NewAlignment) {
+ if (NewIncAlignment % NewAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ } else if (NewIncAlignment == NewAlignment) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ }
+
+ return 0;
+}
+
+bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+ Value *&AAPtr,
+ const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
+ // An alignment assume must be a statement about the least-significant
+ // bits of the pointer being zero, possibly with some offset.
+ ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+ if (!ICI)
+ return false;
+
+ // This must be an expression of the form: x & m == 0.
+ if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+ return false;
+
+ // Swap things around so that the RHS is 0.
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+ const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+ if (CmpLHSSCEV->isZero())
+ std::swap(CmpLHS, CmpRHS);
+ else if (!CmpRHSSCEV->isZero())
+ return false;
+
+ BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+ if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+ return false;
+
+ // Swap things around so that the right operand of the and is a constant
+ // (the mask); we cannot deal with variable masks.
+ Value *AndLHS = CmpBO->getOperand(0);
+ Value *AndRHS = CmpBO->getOperand(1);
+ const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+ const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+ if (isa<SCEVConstant>(AndLHSSCEV)) {
+ std::swap(AndLHS, AndRHS);
+ std::swap(AndLHSSCEV, AndRHSSCEV);
+ }
+
+ const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+ if (!MaskSCEV)
+ return false;
+
+ // The mask must have some trailing ones (otherwise the condition is
+ // trivial and tells us nothing about the alignment of the left operand).
+ unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();
+ if (!TrailingOnes)
+ return false;
+
+ // Cap the alignment at the maximum with which LLVM can deal (and make sure
+ // we don't overflow the shift).
+ uint64_t Alignment;
+ TrailingOnes = std::min(TrailingOnes,
+ unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+ Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+ Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+ AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+ // The LHS might be a ptrtoint instruction, or it might be the pointer
+ // with an offset.
+ AAPtr = nullptr;
+ OffSCEV = nullptr;
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getZero(Int64Ty);
+ } else if (const SCEVAddExpr* AndLHSAddSCEV =
+ dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+ // Try to find the ptrtoint; subtract it and the rest is the offset.
+ for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+ JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+ if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+ break;
+ }
+ }
+
+ if (!AAPtr)
+ return false;
+
+ // Sign extend the offset to 64 bits (so that it is like all of the other
+ // expressions).
+ unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+ if (OffSCEVBits < 64)
+ OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+ else if (OffSCEVBits > 64)
+ return false;
+
+ AAPtr = AAPtr->stripPointerCasts();
+ return true;
+}
+
+bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
+ if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+ return false;
+
+ // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
+ // affect other users.
+ if (isa<ConstantData>(AAPtr))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Apply the assumption to all other users of the specified pointer.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ if (isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+
+ if (NewAlignment > LI->getAlignment()) {
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+
+ if (NewAlignment > SI->getAlignment()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MI->getDest(), SE);
+
+ // For memory transfers, we need a common alignment for both the
+ // source and destination. If we have a new alignment for this
+ // instruction, but only for one operand, save it. If we reach the
+ // other operand through another assumption later, then we may
+ // change the alignment at that point.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MTI->getSource(), SE);
+
+ DenseMap<MemTransferInst *, unsigned>::iterator DI =
+ NewDestAlignments.find(MTI);
+ unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+ 0 : DI->second;
+
+ DenseMap<MemTransferInst *, unsigned>::iterator SI =
+ NewSrcAlignments.find(MTI);
+ unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+ 0 : SI->second;
+
+ DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+ AltDestAlignment << " " << NewSrcAlignment <<
+ " " << AltSrcAlignment << "\n");
+
+ // Of these four alignments, pick the largest possible...
+ unsigned NewAlignment = 0;
+ if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, NewDestAlignment);
+ if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, AltDestAlignment);
+ if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+ if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+ if (NewAlignment > MI->getAlignment()) {
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewAlignment));
+ ++NumMemIntAlignChanged;
+ }
+
+ NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
+ NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+ } else if (NewDestAlignment > MI->getAlignment()) {
+ assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+ "Unknown memory intrinsic");
+
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewDestAlignment));
+ ++NumMemIntAlignChanged;
+ }
+ }
+
+ // Now that we've updated that use of the pointer, look for other uses of
+ // the pointer to update.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ);
+ if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+ }
+
+ return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return Impl.runImpl(F, AC, SE, DT);
+}
+
+bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
+ ScalarEvolution *SE_,
+ DominatorTree *DT_) {
+ SE = SE_;
+ DT = DT_;
+
+ NewDestAlignments.clear();
+ NewSrcAlignments.clear();
+
+ bool Changed = false;
+ for (auto &AssumeVH : AC.assumptions())
+ if (AssumeVH)
+ Changed |= processAssumption(cast<CallInst>(AssumeVH));
+
+ return Changed;
+}
+
+PreservedAnalyses
+AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, AC, &SE, &DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 000000000000..61e8700f1cd6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,118 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
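+// For example, in
+//
+//   unsigned f(unsigned x, unsigned y) {
+//     unsigned hi = y * 75u;           // feeds only bits that are masked off
+//     unsigned t  = x | (hi << 28);
+//     return t & 0x0fffffff;           // bits 28..31 of t are never demanded
+//   }
+//
+// no bit computed by the multiply is ever demanded, so it can be trivialized
+// to zero and then removed. (An illustrative sketch, not an exhaustive
+// description of what the pass handles.)
+//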
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
+ SmallVector<Instruction*, 128> Worklist;
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ // If the instruction has side effects and no non-dbg uses,
+ // skip it. This way we avoid computing known bits on an instruction
+ // that will not help us.
+ if (I.mayHaveSideEffects() && I.use_empty())
+ continue;
+
+ if (I.getType()->isIntegerTy() &&
+ !DB.getDemandedBits(&I).getBoolValue()) {
+ // For live instructions that have all dead bits, first make them dead by
+ // replacing all uses with something else. Then, if they don't need to
+ // remain live (because they have side effects, etc.) we can remove them.
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ Value *Zero = ConstantInt::get(I.getType(), 0);
+ ++NumSimplified;
+ I.replaceNonMetadataUsesWith(Zero);
+ Changed = true;
+ }
+ if (!DB.isInstructionDead(&I))
+ continue;
+
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ Changed = true;
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ if (!bitTrackingDCE(F, DB))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct BDCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCELegacyPass() : FunctionPass(ID) {
+ initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ return bitTrackingDCE(F, DB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 000000000000..122c9314e022
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,797 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare them for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_Basic), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_Basic, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
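+//
+// A C-level sketch of code that benefits, assuming a target where wide
+// immediates are expensive to materialize (e.g. needing several
+// instructions):
+//
+//   uint64_t g(uint64_t a, uint64_t b) {
+//     return (a ^ 0x123456789abcdef0ULL) + (b ^ 0x123456789abcdef0ULL);
+//   }
+//
+// Both uses of the constant are rebased onto a single hoisted
+// materialization instead of each use rematerializing it.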
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <tuple>
+
+using namespace llvm;
+using namespace consthoist;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
+namespace {
+/// \brief The constant hoisting pass.
+class ConstantHoistingLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ ConstantHoistingLegacyPass() : FunctionPass(ID) {
+ initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ StringRef getPassName() const override { return "Constant Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+
+private:
+ ConstantHoistingPass Impl;
+};
+}
+
+char ConstantHoistingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+ return new ConstantHoistingLegacyPass();
+}
+
+/// \brief Perform the constant hoisting optimization for the given function.
+bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
+ if (skipFunction(Fn))
+ return false;
+
+ DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock());
+
+ if (MadeChange) {
+ DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ DEBUG(dbgs() << Fn);
+ }
+ DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+ return MadeChange;
+}
+
+
+/// \brief Find the constant materialization insertion point.
+Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
+ // If the operand is a cast instruction, then we have to materialize the
+ // constant before the cast instruction.
+ if (Idx != ~0U) {
+ Value *Opnd = Inst->getOperand(Idx);
+ if (auto CastInst = dyn_cast<Instruction>(Opnd))
+ if (CastInst->isCast())
+ return CastInst;
+ }
+
+ // The simple and common case. This also includes constant expressions.
+ if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+ return Inst;
+
+ // We can't insert directly before a phi node or an eh pad. Insert before
+ // the terminator of the incoming or dominating block.
+ assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+ if (Idx != ~0U && isa<PHINode>(Inst))
+ return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and has the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
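+///
+/// For example, for a diamond with head A and arms B and C, a request for
+/// BBs = {B, C} comes back as {A} when freq(A) <= freq(B) + freq(C) (one
+/// hoisted copy is no more expensive than two), and is left as {B, C}
+/// otherwise; an informal sketch of the cost comparison made below.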
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SmallPtrSet<BasicBlock *, 8> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+  // dominated by any other block in set 'BBs', and all nodes on the path
+  // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ Path.clear();
+    // Walk up the dominator tree until Entry or another BB in BBs
+    // is reached. Insert the nodes visited on the way into Path.
+ BasicBlock *Node = BB;
+    // The "Path" is a candidate path to be added into the Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+             "Entry doesn't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+    // If isCandidate is false, Node is another block in BBs dominating the
+    // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>
+ InsertPtsCostPair;
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+    // Choose to insert in Node or in the subtree of Node.
+    // Don't hoist to an EH pad because we may not find a proper place to
+    // insert in it.
+    // If the total frequency of InsertPts is the same as the frequency of the
+    // target Node, and InsertPts contains more than one node, choose hoisting
+    // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
+}
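For intuition, here is a minimal standalone sketch of the bottom-up rule above, using a hypothetical Node struct in place of the dominator tree and BlockFrequencyInfo; it only illustrates the frequency comparison (the Candidates pruning and the EHPad restriction are omitted) and is not the LLVM implementation.

#include <cstdint>
#include <utility>
#include <vector>

// Toy dominator-tree node: Freq is the block frequency, WantsConst marks a
// block that must be dominated by some chosen insertion point.
struct Node {
  uint64_t Freq = 0;
  bool WantsConst = false;
  std::vector<Node *> Children;
};

// Returns the cheapest set of insertion points covering all WantsConst blocks
// in N's subtree, together with its total frequency. Mirrors the rule above:
// keep the subtree's points unless inserting at N itself is no more expensive
// (ties favour N to reduce code size).
static std::pair<std::vector<Node *>, uint64_t> bestPoints(Node *N) {
  std::vector<Node *> Pts;
  uint64_t Freq = 0;
  for (Node *C : N->Children) {
    auto Sub = bestPoints(C);
    Pts.insert(Pts.end(), Sub.first.begin(), Sub.first.end());
    Freq += Sub.second;
  }
  if (N->WantsConst || Freq > N->Freq || (Freq == N->Freq && Pts.size() > 1))
    return {{N}, N->Freq};
  return {Pts, Freq};
}

int main() {
  // Entry dominates two children; both want the constant and are colder than
  // the entry, so the sketch keeps the two separate insertion points.
  Node A{10, true, {}}, B{10, true, {}};
  Node Entry{50, false, {&A, &B}};
  return bestPoints(&Entry).first.size() == 2 ? 0 : 1;
}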
+
+/// \brief Find an insertion point that dominates all uses.
+SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
+ const ConstantInfo &ConstInfo) const {
+ assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+ // Collect all basic blocks.
+ SmallPtrSet<BasicBlock *, 8> BBs;
+ SmallPtrSet<Instruction *, 8> InsertPts;
+ for (auto const &RCI : ConstInfo.RebasedConstants)
+ for (auto const &U : RCI.Uses)
+ BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
+
+ while (BBs.size() >= 2) {
+ BasicBlock *BB, *BB1, *BB2;
+ BB1 = *BBs.begin();
+ BB2 = *std::next(BBs.begin());
+ BB = DT->findNearestCommonDominator(BB1, BB2);
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+ BBs.erase(BB1);
+ BBs.erase(BB2);
+ BBs.insert(BB);
+ }
+ assert((BBs.size() == 1) && "Expected only one element.");
+ Instruction &FirstInst = (*BBs.begin())->front();
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
+}
+
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt) {
+ unsigned Cost;
+ // Ask the target about the cost of materializing the constant for the given
+ // instruction and operand index.
+ if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+ Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+ ConstInt->getValue(), ConstInt->getType());
+ else
+ Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+ ConstInt->getType());
+
+ // Ignore cheap integer constants.
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0));
+ if (Inserted) {
+ ConstCandVec.push_back(ConstantCandidate(ConstInt));
+ Itr->second = ConstCandVec.size() - 1;
+ }
+ ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+ DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+ dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ << " with cost " << Cost << '\n';
+ else
+ dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+ << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+ << Cost << '\n';
+ );
+ }
+}
+
+
+/// \brief Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    // Only visit cast instructions, which were skipped during the initial
+    // scan. All other instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+}
+
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCost`) for instructions which only take
+ // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So
+ // it's safe for us to collect constant candidates from all IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
+ }
+ } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
+ ConstCandMapType ConstCandMap;
+ for (BasicBlock &BB : Fn)
+ for (Instruction &Inst : BB)
+ collectConstantCandidates(ConstCandMap, &Inst);
+}
+
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt operator- does not like that). If either value cannot be
+// represented in a uint64_t we return None, which is then interpreted as the
+// value not being in range.
+static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
+ const APInt &V2) {
+ llvm::Optional<APInt> Res = None;
+ unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+ V1.getBitWidth() : V2.getBitWidth();
+ uint64_t LimVal1 = V1.getLimitedValue();
+ uint64_t LimVal2 = V2.getLimitedValue();
+
+ if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+ return Res;
+
+ uint64_t Diff = LimVal1 - LimVal2;
+ return APInt(BW, Diff, true);
+}
+
+// From a list of constants, one needs to be picked as the base and the other
+// constants will be transformed into offsets from that base constant. The
+// question is: which one is the best pick? For example, consider these constants
+// and their number of uses:
+//
+// Constants| 2 | 4 | 12 | 42 |
+// NumUses | 3 | 2 | 8 | 7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..20 are most optimally
+// supported by the architecture; then selecting constant 2 is most optimal,
+// because this will generate offsets 0, 2, 10 and 40. Offsets 0, 2 and 10 are
+// in range 0..20, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12
+// would generate offsets -10, -8, 0 and 30, of which only 0 is in range, so
+// only 8 uses are in range and choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant, the range of the offsets is a very important
+// factor that we take into account here. This algorithm calculates a total
+// cost for selecting a constant as the base and subtracts the costs of
+// immediates that end up out of range. It has quadratic complexity, so we
+// call this function only when we're optimising for size and there are no
+// more than 100 constants; otherwise we fall back to the straightforward
+// algorithm, which does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E,
+ ConstCandVecType::iterator &MaxCostItr) {
+ unsigned NumUses = 0;
+
+  if (!Entry->getParent()->optForSize() || std::distance(S, E) > 100) {
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ MaxCostItr = ConstCand;
+ }
+ return NumUses;
+ }
+
+ DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ int MaxCost = -1;
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ auto Value = ConstCand->ConstInt->getValue();
+ Type *Ty = ConstCand->ConstInt->getType();
+ int Cost = 0;
+ NumUses += ConstCand->Uses.size();
+ DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+
+ for (auto User : ConstCand->Uses) {
+ unsigned Opcode = User.Inst->getOpcode();
+ unsigned OpndIdx = User.OpndIdx;
+ Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
+ DEBUG(dbgs() << "Cost: " << Cost << "\n");
+
+ for (auto C2 = S; C2 != E; ++C2) {
+ llvm::Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
+ if (Diff) {
+ const int ImmCosts =
+ TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
+ Cost -= ImmCosts;
+ DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
+ }
+ }
+ }
+ DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ if (Cost > MaxCost) {
+ MaxCost = Cost;
+ MaxCostItr = ConstCand;
+ DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
+ }
+ }
+ return NumUses;
+}
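As a cross-check of the arithmetic in the comment above, here is a small self-contained sketch (no TTI hooks; the 0..20 legal-immediate range is the hypothetical one from the comment) that counts, for each candidate base, how many uses end up with an in-range offset.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Cand { int64_t Value; unsigned NumUses; };

int main() {
  // Constants and use counts from the comment above.
  std::vector<Cand> Cands = {{2, 3}, {4, 2}, {12, 8}, {42, 7}};
  const int64_t MinOff = 0, MaxOff = 20; // hypothetical legal immediate range

  for (const Cand &Base : Cands) {
    unsigned InRange = 0;
    for (const Cand &C : Cands) {
      int64_t Off = C.Value - Base.Value;
      if (Off >= MinOff && Off <= MaxOff)
        InRange += C.NumUses;
    }
    std::printf("base %lld: %u uses with in-range offsets\n",
                (long long)Base.Value, InRange);
  }
  // Prints 13 for base 2 and 8 for base 12, matching the comment above.
  return 0;
}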
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoistingPass::findAndMakeBaseConstant(
+ ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
+ auto MaxCostItr = S;
+ unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
+
+ // Don't hoist constants that have only one use.
+ if (NumUses <= 1)
+ return;
+
+ ConstantInfo ConstInfo;
+ ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+ Type *Ty = ConstInfo.BaseConstant->getType();
+
+ // Rebase the constants with respect to the base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ APInt Diff = ConstCand->ConstInt->getValue() -
+ ConstInfo.BaseConstant->getValue();
+ Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+ ConstInfo.RebasedConstants.push_back(
+ RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
+ }
+ ConstantVec.push_back(std::move(ConstInfo));
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoistingPass::findBaseConstants() {
+ // Sort the constants by value and type. This invalidates the mapping!
+ std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+ return LHS.ConstInt->getType()->getBitWidth() <
+ RHS.ConstInt->getType()->getBitWidth();
+ return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+ });
+
+ // Simple linear scan through the sorted constant candidate vector for viable
+ // merge candidates.
+ auto MinValItr = ConstCandVec.begin();
+ for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+ CC != E; ++CC) {
+ if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+ // Check if the constant is in range of an add with immediate.
+ APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+ if ((Diff.getBitWidth() <= 64) &&
+ TTI->isLegalAddImmediate(Diff.getSExtValue()))
+ continue;
+ }
+ // We either have now a different constant type or the constant is not in
+ // range of an add with immediate anymore.
+ findAndMakeBaseConstant(MinValItr, CC);
+ // Start a new base constant search.
+ MinValItr = CC;
+ }
+ // Finalize the last base constant search.
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+ if (auto PHI = dyn_cast<PHINode>(Inst)) {
+ // Check if any previous operand of the PHI node has the same incoming basic
+ // block. This is a very odd case that happens when the incoming basic block
+ // has a switch statement. In this case use the same value as the previous
+ // operand(s), otherwise we will fail verification due to different values.
+ // The values are actually the same, but the variable names are different
+ // and the verifier doesn't like that.
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+ for (unsigned i = 0; i < Idx; ++i) {
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ Value *IncomingVal = PHI->getIncomingValue(i);
+ Inst->setOperand(Idx, IncomingVal);
+ return false;
+ }
+ }
+ }
+
+ Inst->setOperand(Idx, Mat);
+ return true;
+}
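A toy illustration of the duplicate-incoming-block rule above, using a hypothetical ToyPHI of (block name, value name) pairs instead of real IR; the names %mat0, %mat1 and case.bb are made up for the example.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for a PHI node: a list of (incoming block, incoming value)
// pairs. A block may appear more than once (e.g. several switch cases
// branching to the same successor), and the verifier requires all entries for
// one block to carry the same value.
using ToyPHI = std::vector<std::pair<std::string, std::string>>;

// Mirrors the rule above: reuse the value of an earlier entry for the same
// incoming block; otherwise install NewVal. Returns true iff NewVal was used.
static bool updateIncoming(ToyPHI &PHI, unsigned Idx,
                           const std::string &NewVal) {
  for (unsigned I = 0; I < Idx; ++I)
    if (PHI[I].first == PHI[Idx].first) {
      PHI[Idx].second = PHI[I].second;
      return false;
    }
  PHI[Idx].second = NewVal;
  return true;
}

int main() {
  ToyPHI P = {{"case.bb", "%mat0"}, {"case.bb", "%old"}, {"other.bb", "%old"}};
  assert(!updateIncoming(P, 1, "%mat1")); // duplicate block: reuses %mat0
  assert(updateIncoming(P, 2, "%mat1"));  // unique block: %mat1 is used
  return 0;
}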
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
+ Constant *Offset,
+ const ConstantUser &ConstUser) {
+ Instruction *Mat = Base;
+ if (Offset) {
+ Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx);
+ Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+ "const_mat", InsertionPt);
+
+ DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+ }
+ Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+ // Visit constant integer.
+ if (isa<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+ Mat->eraseFromParent();
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit cast instruction.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    assert(CastInst->isCast() && "Expected a cast instruction!");
+ // Check if we already have visited this cast instruction before to avoid
+ // unnecessary cloning.
+ Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+ if (!ClonedCastInst) {
+ ClonedCastInst = CastInst->clone();
+ ClonedCastInst->setOperand(0, Mat);
+ ClonedCastInst->insertAfter(CastInst);
+ // Use the same debug location as the original cast instruction.
+ ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+ DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
+ }
+
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit constant expression.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ ConstExprInst->setOperand(0, Mat);
+ ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx));
+
+ // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+ DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+ ConstExprInst->eraseFromParent();
+ if (Offset)
+ Mat->eraseFromParent();
+ }
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+}
+
+/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoistingPass::emitBaseConstants() {
+ bool MadeChange = false;
+ for (auto const &ConstInfo : ConstantVec) {
+ // Hoist and hide the base constant behind a bitcast.
+ SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
+ assert(!IPSet.empty() && "IPSet is empty");
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ for (Instruction *IP : IPSet) {
+ IntegerType *Ty = ConstInfo.BaseConstant->getType();
+ Instruction *Base =
+ new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+ DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for all rebased constants.
+ unsigned Uses = 0;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+          // If the base constant is to be inserted in multiple places,
+          // generate the rebase for U using the Base that dominates U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(Base->getParent(), OrigMatInsertBB)) {
+ emitBaseConstants(Base, RCI.Offset, U);
+ ReBasesNum++;
+ }
+ }
+ }
+ UsesNum = Uses;
+
+ // Use the same debug location as the last user of the constant.
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+ }
+ (void)UsesNum;
+ (void)ReBasesNum;
+    // Expect all uses to be rebased after rebasing is done.
+ assert(UsesNum == ReBasesNum && "Not all uses are rebased");
+
+ NumConstantsHoisted++;
+
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1;
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// \brief Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoistingPass::deleteDeadCastInst() const {
+ for (auto const &I : ClonedCastMap)
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
+}
+
+/// \brief Optimize expensive integer constants in the given function.
+bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry) {
+ this->TTI = &TTI;
+ this->DT = &DT;
+ this->BFI = BFI;
+ this->Entry = &Entry;
+ // Collect all constant candidates.
+ collectConstantCandidates(Fn);
+
+ // There are no constant candidates to worry about.
+ if (ConstCandVec.empty())
+ return false;
+
+ // Combine constants that can be easily materialized with an add from a common
+ // base constant.
+ findBaseConstants();
+
+ // There are no constants to emit.
+ if (ConstantVec.empty())
+ return false;
+
+ // Finally hoist the base constant and emit materialization code for dependent
+ // constants.
+ bool MadeChange = emitBaseConstants();
+
+ // Cleanup dead instructions.
+ deleteDeadCastInst();
+
+ return MadeChange;
+}
+
+PreservedAnalyses ConstantHoistingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 000000000000..4fa27891a974
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,104 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+// * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+// * This pass has a habit of making definitions be dead. It is a good idea
+// to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "constprop"
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+ struct ConstantPropagation : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantPropagation() : FunctionPass(ID) {
+ initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+ };
+}
+
+char ConstantPropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+ return new ConstantPropagation();
+}
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Initialize the worklist to all of the instructions ready to process...
+ std::set<Instruction*> WorkList;
+ for (Instruction &I: instructions(&F))
+ WorkList.insert(&I);
+
+ bool Changed = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ while (!WorkList.empty()) {
+ Instruction *I = *WorkList.begin();
+ WorkList.erase(WorkList.begin()); // Get an element from the worklist...
+
+ if (!I->use_empty()) // Don't muck with dead instructions...
+ if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+ // Add all of the users of this instruction to the worklist, they might
+ // be constant propagatable now...
+ for (User *U : I->users())
+ WorkList.insert(cast<Instruction>(U));
+
+ // Replace all of the uses of a variable with uses of the constant.
+ I->replaceAllUsesWith(C);
+
+ // Remove the dead instruction.
+ WorkList.erase(I);
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ ++NumInstKilled;
+ }
+
+ // We made a change to the function...
+ Changed = true;
+ }
+ }
+ return Changed;
+}
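The worklist loop above is easiest to see on a toy example. The sketch below folds adds over a tiny hand-rolled instruction array (indices stand in for SSA values); it mirrors the re-enqueueing of users, not the real ConstantFoldInstruction machinery.

#include <cstdio>
#include <set>
#include <vector>

// Toy IR: each instruction is either a constant or an add of two earlier
// instructions. The index into Insts serves as the SSA name.
struct ToyInst {
  bool IsConst;
  int ConstVal = 0;       // valid when IsConst
  int Op0 = -1, Op1 = -1; // operand indices when !IsConst
};

// A miniature version of the worklist loop above: fold any add whose operands
// are (or have become) constants, and re-enqueue its users.
static void foldConstants(std::vector<ToyInst> &Insts) {
  std::set<int> WorkList;
  for (int I = 0, E = (int)Insts.size(); I != E; ++I)
    WorkList.insert(I);

  while (!WorkList.empty()) {
    int I = *WorkList.begin();
    WorkList.erase(WorkList.begin());
    ToyInst &Inst = Insts[I];
    if (Inst.IsConst)
      continue;
    const ToyInst &A = Insts[Inst.Op0], &B = Insts[Inst.Op1];
    if (!A.IsConst || !B.IsConst)
      continue;
    // Fold the add and revisit every user, which may now be foldable too.
    Inst.IsConst = true;
    Inst.ConstVal = A.ConstVal + B.ConstVal;
    for (int U = 0, E = (int)Insts.size(); U != E; ++U)
      if (!Insts[U].IsConst && (Insts[U].Op0 == I || Insts[U].Op1 == I))
        WorkList.insert(U);
  }
}

int main() {
  // %0 = 1, %1 = 2, %2 = add %0, %1, %3 = add %2, %2
  std::vector<ToyInst> Insts = {{true, 1}, {true, 2}, {false, 0, 0, 1},
                                {false, 0, 2, 2}};
  foldConstants(Insts);
  std::printf("%%2 = %d, %%3 = %d\n", Insts[2].ConstVal, Insts[3].ConstVal);
  return 0;
}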
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 000000000000..28157783daa7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,580 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+STATISTIC(NumReturns, "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
+STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumAShrs, "Number of ashr converted to lshr");
+STATISTIC(NumSRems, "Number of srem converted to urem");
+
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
+namespace {
+ class CorrelatedValuePropagation : public FunctionPass {
+ public:
+ static char ID;
+ CorrelatedValuePropagation(): FunctionPass(ID) {
+ initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+}
+
+char CorrelatedValuePropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+ return new CorrelatedValuePropagation();
+}
+
+static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
+ if (S->getType()->isVectorTy()) return false;
+ if (isa<Constant>(S->getOperand(0))) return false;
+
+ Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ if (!C) return false;
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(C);
+ if (!CI) return false;
+
+ Value *ReplaceWith = S->getOperand(1);
+ Value *Other = S->getOperand(2);
+ if (!CI->isOne()) std::swap(ReplaceWith, Other);
+ if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
+
+ S->replaceAllUsesWith(ReplaceWith);
+ S->eraseFromParent();
+
+ ++NumSelects;
+
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+ const SimplifyQuery &SQ) {
+ bool Changed = false;
+
+ BasicBlock *BB = P->getParent();
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (isa<Constant>(Incoming)) continue;
+
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+    // Check if the incoming value is a select with a scalar condition for which
+    // LVI can tell us the value. In that case replace the incoming value with
+ // the appropriate value of the select. This often allows us to remove the
+ // select later.
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Value *Condition = SI->getCondition();
+ if (!Condition->getType()->isVectorTy()) {
+ if (Constant *C = LVI->getConstantOnEdge(
+ Condition, P->getIncomingBlock(i), BB, P)) {
+ if (C->isOneValue()) {
+ V = SI->getTrueValue();
+ } else if (C->isZeroValue()) {
+ V = SI->getFalseValue();
+ }
+ // Once LVI learns to handle vector types, we could also add support
+ // for vector type constants that are not all zeroes or all ones.
+ }
+ }
+
+      // Check if the select has a constant but LVI tells us that the incoming
+ // value can never be that constant. In that case replace the incoming
+ // value with the other value of the select. This often allows us to
+ // remove the select later.
+ if (!V) {
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB, P) !=
+ LazyValueInfo::False)
+ continue;
+ V = SI->getTrueValue();
+ }
+
+ DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ }
+
+ P->setIncomingValue(i, V);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyInstruction(P, SQ)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent();
+ Changed = true;
+ }
+
+ if (Changed)
+ ++NumPhis;
+
+ return Changed;
+}
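A minimal sketch of the first threading rule above: replace a select-typed incoming value when the condition is known on that edge. std::optional<bool> stands in for whatever the analysis can prove on the edge; ToySelect and the value names are made up for the example.

#include <cassert>
#include <optional>
#include <string>

// Toy model: the PHI's incoming value is select(Cond, TrueVal, FalseVal), and
// KnownCondOnEdge is what a value-info analysis can prove about Cond on that
// particular incoming edge.
struct ToySelect {
  std::string TrueVal, FalseVal;
};

static std::string threadIncoming(const ToySelect &Sel,
                                  std::optional<bool> KnownCondOnEdge,
                                  const std::string &Original) {
  if (!KnownCondOnEdge)
    return Original; // nothing known on this edge; keep the select
  return *KnownCondOnEdge ? Sel.TrueVal : Sel.FalseVal;
}

int main() {
  ToySelect Sel{"%a", "%b"};
  assert(threadIncoming(Sel, true, "%sel") == "%a");
  assert(threadIncoming(Sel, std::nullopt, "%sel") == "%sel");
  return 0;
}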
+
+static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
+ Value *Pointer = nullptr;
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ Pointer = L->getPointerOperand();
+ else
+ Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+ if (isa<Constant>(Pointer)) return false;
+
+ Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
+ if (!C) return false;
+
+ ++NumMemAccess;
+ I->replaceUsesOfWith(Pointer, C);
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove this comparison. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return false;
+
+ // As a policy choice, we choose not to waste compile time on anything where
+ // the comparison is testing local values. While LVI can sometimes reason
+ // about such cases, it's not its primary purpose. We do make sure to do
+ // the block local query for uses from terminator instructions, but that's
+ // handled in the code for each terminator.
+ auto *I = dyn_cast<Instruction>(Op0);
+ if (I && I->getParent() == C->getParent())
+ return false;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
+ if (Result == LazyValueInfo::Unknown) return false;
+
+ ++NumCmps;
+ if (Result == LazyValueInfo::True)
+ C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+ else
+ C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+ C->eraseFromParent();
+
+ return true;
+}
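The range-based reasoning can be illustrated with a toy interval model. The sketch below decides an unsigned less-than against a constant from a known inclusive [Lo, Hi] interval; it is a simplification of what getPredicateAt does, not LVI itself.

#include <cstdint>
#include <cstdio>

// If range analysis knows Op0 lies in [Lo, Hi], an unsigned "Op0 < C"
// comparison can sometimes be decided without looking at Op0 itself.
enum class Tristate { False, True, Unknown };

static Tristate evalULT(uint64_t Lo, uint64_t Hi, uint64_t C) {
  if (Hi < C)
    return Tristate::True;   // every value in [Lo, Hi] is < C
  if (Lo >= C)
    return Tristate::False;  // no value in [Lo, Hi] is < C
  return Tristate::Unknown;  // the interval straddles C
}

int main() {
  // E.g. a loop counter known to be in [0, 7] compared against 16 and 4.
  std::printf("%d %d %d\n", (int)evalULT(0, 7, 16), // True
              (int)evalULT(8, 12, 4),               // False
              (int)evalULT(0, 7, 4));               // Unknown
  return 0;
}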
+
+/// Simplify a switch instruction by removing cases which can never fire. If the
+/// uselessness of a case could be determined locally then constant propagation
+/// would already have figured it out. Instead, walk the predecessors and
+/// statically evaluate cases based on information available on that edge. Cases
+/// that cannot fire no matter which incoming edge is taken can be removed. If
+/// a case fires on every incoming edge then the entire switch can be removed
+/// and replaced with a branch to the case destination.
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+ Value *Cond = SI->getCondition();
+ BasicBlock *BB = SI->getParent();
+
+  // If the condition was defined in the same block as the switch then LazyValueInfo
+ // currently won't say anything useful about it, though in theory it could.
+ if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
+ return false;
+
+ // If the switch is unreachable then trying to improve it is a waste of time.
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ if (PB == PE) return false;
+
+ // Analyse each switch case in turn.
+ bool Changed = false;
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
+
+ // Check to see if the switch condition is equal to/not equal to the case
+ // value on every incoming edge, equal/not equal being the same each time.
+ LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ // Is the switch condition equal to the case value?
+ LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
+ Cond, Case, *PI,
+ BB, SI);
+ // Give up on this case if nothing is known.
+ if (Value == LazyValueInfo::Unknown) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+
+ // If this was the first edge to be visited, record that all other edges
+ // need to give the same result.
+ if (PI == PB) {
+ State = Value;
+ continue;
+ }
+
+ // If this case is known to fire for some edges and known not to fire for
+ // others then there is nothing we can do - give up.
+ if (Value != State) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+ }
+
+ if (State == LazyValueInfo::False) {
+ // This case never fires - remove it.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
+ ++NumDeadCases;
+ Changed = true;
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
+ // This case always fires. Arrange for the switch to be turned into an
+ // unconditional branch by replacing the switch condition with the case
+ // value.
+ SI->setCondition(Case);
+ NumDeadCases += SI->getNumCases();
+ Changed = true;
+ break;
+ }
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
+ }
+
+ if (Changed)
+ // If the switch has been simplified to the point where it can be replaced
+ // by a branch then do so now.
+ ConstantFoldTerminator(BB);
+
+ return Changed;
+}
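A toy model of the per-edge case analysis above, where std::optional<int> stands in for what is known about the switch condition on each predecessor edge; it is an illustrative sketch, not the LVI-backed implementation.

#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

enum class Tristate { False, True, Unknown };

static Tristate caseFires(const std::vector<std::optional<int>> &EdgeValues,
                          int CaseValue) {
  Tristate State = Tristate::Unknown;
  for (std::size_t I = 0; I != EdgeValues.size(); ++I) {
    if (!EdgeValues[I])
      return Tristate::Unknown;           // nothing known on this edge
    Tristate V = (*EdgeValues[I] == CaseValue) ? Tristate::True
                                               : Tristate::False;
    if (I == 0)
      State = V;                          // first edge sets the expectation
    else if (V != State)
      return Tristate::Unknown;           // edges disagree; give up
  }
  return State; // False => the case can be removed, True => it always fires
}

int main() {
  // Two predecessor edges on which the condition is known to be 3 and 5.
  std::vector<std::optional<int>> Edges = {3, 5};
  std::printf("case 7: %d\n", (int)caseFires(Edges, 7)); // False: removable
  std::printf("case 3: %d\n", (int)caseFires(Edges, 3)); // Unknown: disagree
  return 0;
}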
+
+/// Infer nonnull attributes for the arguments at the specified callsite.
+static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
+ for (Value *V : CS.args()) {
+ PointerType *Type = dyn_cast<PointerType>(V->getType());
+ // Try to mark pointer typed parameters as non-null. We skip the
+ // relatively expensive analysis for constants which are obviously either
+ // null or non-null to start with.
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ !isa<Constant>(V) &&
+ LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
+ ConstantPointerNull::get(Type),
+ CS.getInstruction()) == LazyValueInfo::False)
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == CS.arg_size() && "sanity check");
+
+ if (ArgNos.empty())
+ return false;
+
+ AttributeList AS = CS.getAttributes();
+ LLVMContext &Ctx = CS.getInstruction()->getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ CS.setAttributes(AS);
+
+ return true;
+}
+
+// Helper function to rewrite srem and sdiv. As a policy choice, we choose not
+// to waste compile time on anything where the operands are local defs. While
+// LVI can sometimes reason about such cases, it's not its primary purpose.
+static bool hasLocalDefs(BinaryOperator *SDI) {
+ for (Value *O : SDI->operands()) {
+ auto *I = dyn_cast<Instruction>(O);
+ if (I && I->getParent() == SDI->getParent())
+ return true;
+ }
+ return false;
+}
+
+static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ for (Value *O : SDI->operands()) {
+ auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
+ if (Result != LazyValueInfo::True)
+ return false;
+ }
+ return true;
+}
+
+static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSRems;
+ auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove that both operands of this SDiv are
+/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+ !hasPositiveOperands(SDI, LVI))
+ return false;
+
+ ++NumSDivs;
+ auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
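The sdiv-to-udiv (and srem-to-urem) rewrite relies on signed and unsigned division agreeing for non-negative operands. A quick exhaustive check of that fact over 8-bit values, as a sanity sketch rather than a proof:

#include <cassert>
#include <cstdint>

int main() {
  // For every non-negative dividend and positive divisor, signed and unsigned
  // division and remainder produce identical results.
  for (int A = 0; A <= INT8_MAX; ++A)
    for (int B = 1; B <= INT8_MAX; ++B) {
      int8_t SA = (int8_t)A, SB = (int8_t)B;
      uint8_t UA = (uint8_t)A, UB = (uint8_t)B;
      assert((int8_t)(SA / SB) == (int8_t)(UA / UB));
      assert((int8_t)(SA % SB) == (int8_t)(UA % UB));
    }
  return 0;
}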
+
+static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
+ return false;
+
+ Constant *Zero = ConstantInt::get(SDI->getType(), 0);
+ if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
+ LazyValueInfo::True)
+ return false;
+
+ ++NumAShrs;
+ auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
+ typedef OverflowingBinaryOperator OBO;
+
+ if (DontProcessAdds)
+ return false;
+
+ if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
+ return false;
+
+ bool NSW = AddOp->hasNoSignedWrap();
+ bool NUW = AddOp->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ return false;
+
+ BasicBlock *BB = AddOp->getParent();
+
+ Value *LHS = AddOp->getOperand(0);
+ Value *RHS = AddOp->getOperand(1);
+
+ ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);
+
+  // Initialize RRange only if we need it. If we know that the guaranteed
+  // no-wrap range for the given LHS range is empty, don't spend time
+  // calculating the range for the RHS.
+ Optional<ConstantRange> RRange;
+ auto LazyRRange = [&] () {
+ if (!RRange)
+ RRange = LVI->getConstantRange(RHS, BB, AddOp);
+ return RRange.getValue();
+ };
+
+ bool Changed = false;
+ if (!NUW) {
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
+ if (!NUWRange.isEmptySet()) {
+ bool NewNUW = NUWRange.contains(LazyRRange());
+ AddOp->setHasNoUnsignedWrap(NewNUW);
+ Changed |= NewNUW;
+ }
+ }
+ if (!NSW) {
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
+ if (!NSWRange.isEmptySet()) {
+ bool NewNSW = NSWRange.contains(LazyRRange());
+ AddOp->setHasNoSignedWrap(NewNSW);
+ Changed |= NewNSW;
+ }
+ }
+
+ return Changed;
+}
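A toy version of the nuw half of this inference for an 8-bit add, assuming the known maxima are already below 256; the real code uses ConstantRange::makeGuaranteedNoWrapRegion, which also handles the signed (nsw) case.

#include <cstdint>
#include <cstdio>

// If LHS is known to be at most LMax and RHS at most RMax (both < 256), the
// add can never wrap when RMax fits into the no-wrap region [0, 255 - LMax].
static bool addCannotWrapU8(uint32_t LMax, uint32_t RMax) {
  return RMax <= 255u - LMax;
}

int main() {
  std::printf("%d\n", addCannotWrapU8(100, 100)); // 1: 100 + 100 <= 255
  std::printf("%d\n", addCannotWrapU8(200, 100)); // 0: 200 + 100 may wrap
  return 0;
}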
+
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
+ if (Constant *C = LVI->getConstant(V, At->getParent(), At))
+ return C;
+
+ // TODO: The following really should be sunk inside LVI's core algorithm, or
+ // at least the outer shims around such.
+ auto *C = dyn_cast<CmpInst>(V);
+ if (!C) return nullptr;
+
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return nullptr;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
+ if (Result == LazyValueInfo::Unknown)
+ return nullptr;
+
+ return (Result == LazyValueInfo::True) ?
+ ConstantInt::getTrue(C->getContext()) :
+ ConstantInt::getFalse(C->getContext());
+}
+
+static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+ bool FnChanged = false;
+ // Visiting in a pre-order depth-first traversal causes us to simplify early
+ // blocks before querying later blocks (which require us to analyze early
+ // blocks). Eagerly simplifying shallow blocks means there is strictly less
+ // work to do for deep blocks. This also means we don't visit unreachable
+ // blocks.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ bool BBChanged = false;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *II = &*BI++;
+ switch (II->getOpcode()) {
+ case Instruction::Select:
+ BBChanged |= processSelect(cast<SelectInst>(II), LVI);
+ break;
+ case Instruction::PHI:
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ BBChanged |= processCmp(cast<CmpInst>(II), LVI);
+ break;
+ case Instruction::Load:
+ case Instruction::Store:
+ BBChanged |= processMemAccess(II, LVI);
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ BBChanged |= processCallSite(CallSite(II), LVI);
+ break;
+ case Instruction::SRem:
+ BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::SDiv:
+ BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::AShr:
+ BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::Add:
+ BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
+ break;
+ }
+ }
+
+ Instruction *Term = BB->getTerminator();
+ switch (Term->getOpcode()) {
+ case Instruction::Switch:
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+ break;
+ case Instruction::Ret: {
+ auto *RI = cast<ReturnInst>(Term);
+ // Try to determine the return value if we can. This is mainly here to
+ // simplify the writing of unit tests, but also helps to enable IPO by
+ // constant folding the return values of callees.
+ auto *RetVal = RI->getReturnValue();
+ if (!RetVal) break; // handle "ret void"
+ if (isa<Constant>(RetVal)) break; // nothing to do
+ if (auto *C = getConstantAt(RetVal, RI, LVI)) {
+ ++NumReturns;
+ RI->replaceUsesOfWith(RetVal, C);
+ BBChanged = true;
+ }
+ }
+ }
+
+ FnChanged |= BBChanged;
+ }
+
+ return FnChanged;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+}
+
+PreservedAnalyses
+CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 000000000000..fa4806e884c3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,163 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // DeadInstElimination pass implementation
+ //
+ struct DeadInstElimination : public BasicBlockPass {
+ static char ID; // Pass identification, replacement for typeid
+ DeadInstElimination() : BasicBlockPass(ID) {
+ initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnBasicBlock(BasicBlock &BB) override {
+ if (skipBasicBlock(BB))
+ return false;
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+ Instruction *Inst = &*DI++;
+ if (isInstructionTriviallyDead(Inst, TLI)) {
+ Inst->eraseFromParent();
+ Changed = true;
+ ++DIEEliminated;
+ }
+ }
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+char DeadInstElimination::ID = 0;
+INITIALIZE_PASS(DeadInstElimination, "die",
+ "Dead Instruction Elimination", false, false)
+
+Pass *llvm::createDeadInstEliminationPass() {
+ return new DeadInstElimination();
+}
+
+static bool DCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+ ++DCEEliminated;
+ return true;
+ }
+ return false;
+}
+
+static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
+ Instruction *I = &*FI;
+ ++FI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+ return MadeChange;
+}
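The cascading deletion above can be sketched over a toy def-use graph: deleting a trivially dead instruction decrements its operands' use counts and re-enqueues any operand that becomes dead. This is an illustration of the worklist idea, not the LLVM data structures.

#include <cstdio>
#include <vector>

// Toy def-use graph: each instruction records its operands (by index), a use
// count, and whether it has side effects.
struct ToyInst {
  std::vector<int> Ops;
  int NumUses = 0;
  bool SideEffects = false;
  bool Erased = false;
};

static bool triviallyDead(const ToyInst &I) {
  return !I.Erased && I.NumUses == 0 && !I.SideEffects;
}

// Delete every trivially dead instruction; deleting one may make its operands
// dead, so they are pushed onto a worklist and revisited, as in the pass above.
static unsigned eliminate(std::vector<ToyInst> &Insts) {
  unsigned Removed = 0;
  std::vector<int> WorkList;
  for (int I = 0, E = (int)Insts.size(); I != E; ++I)
    if (triviallyDead(Insts[I]))
      WorkList.push_back(I);

  while (!WorkList.empty()) {
    int I = WorkList.back();
    WorkList.pop_back();
    if (!triviallyDead(Insts[I]))
      continue;
    Insts[I].Erased = true;
    ++Removed;
    for (int Op : Insts[I].Ops)
      if (--Insts[Op].NumUses == 0 && triviallyDead(Insts[Op]))
        WorkList.push_back(Op);
  }
  return Removed;
}

int main() {
  // %0 feeds %1, %1 feeds %2; only %2 is initially dead, but removing it
  // cascades to %1 and %0.
  std::vector<ToyInst> Insts(3);
  Insts[1].Ops = {0}; Insts[0].NumUses = 1;
  Insts[2].Ops = {1}; Insts[1].NumUses = 1;
  std::printf("removed %u\n", eliminate(Insts)); // removed 3
  return 0;
}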
+
+PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct DCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCELegacyPass() : FunctionPass(ID) {
+ initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+
+ return eliminateDeadCode(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char DCELegacyPass::ID = 0;
+INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCELegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 000000000000..1ec38e56aa4c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,1242 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <map>
+using namespace llvm;
+
+#define DEBUG_TYPE "dse"
+
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial-overwrite tracking in DSE"));
+
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+typedef std::map<int64_t, int64_t> OverlapIntervalsTy;
+typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy;
+
+/// Delete this instruction. Before we do, go through and zero out all the
+/// operands of this instruction. If any of them become dead, delete them and
+/// the computation tree that feeds them.
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+static void
+deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
+ MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering,
+ SmallSetVector<Value *, 16> *ValueSet = nullptr) {
+ SmallVector<Instruction*, 32> NowDeadInsts;
+
+ NowDeadInsts.push_back(I);
+ --NumFastOther;
+
+ // Keeping the iterator straight is a pain, so we let this routine tell the
+ // caller what the next instruction is after we're done mucking about.
+ BasicBlock::iterator NewIter = *BBI;
+
+ // Before we touch this instruction, remove it from memdep!
+ do {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ ++NumFastOther;
+
+ // This instruction is dead, zap it, in stages. Start by removing it from
+ // MemDep, which needs to know the operands and needs it to be in the
+ // function.
+ MD.removeInstruction(DeadInst);
+
+ for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+ Value *Op = DeadInst->getOperand(op);
+ DeadInst->setOperand(op, nullptr);
+
+ // If this operand just became dead, add it to the NowDeadInsts list.
+ if (!Op->use_empty()) continue;
+
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ if (ValueSet) ValueSet->remove(DeadInst);
+ InstrOrdering->erase(DeadInst);
+ IOL.erase(DeadInst);
+
+ if (NewIter == DeadInst->getIterator())
+ NewIter = DeadInst->eraseFromParent();
+ else
+ DeadInst->eraseFromParent();
+ } while (!NowDeadInsts.empty());
+ *BBI = NewIter;
+}
+
+/// Does this instruction write some memory? This only returns true for things
+/// that we can analyze with other helpers below.
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+ if (isa<StoreInst>(I))
+ return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::init_trampoline:
+ case Intrinsic::lifetime_end:
+ return true;
+ }
+ }
+ if (auto CS = CallSite(I)) {
+ if (Function *F = CS.getCalledFunction()) {
+ StringRef FnName = F->getName();
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
+ return true;
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
+ return true;
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
+ return true;
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return a Location stored to by the specified instruction. If isRemovable
+/// returns true, this function and getLocForRead completely describe the memory
+/// operations for this instruction.
+static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return MemoryLocation::get(SI);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+ // memcpy/memmove/memset.
+ MemoryLocation Loc = MemoryLocation::getForDest(MI);
+ return Loc;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!II)
+ return MemoryLocation();
+
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
+ // FIXME: We don't know the size of the trampoline, so we can't really
+ // handle it here.
+ return MemoryLocation(II->getArgOperand(0));
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
+}
+
+/// Return the location read by the specified "hasMemoryWrite" instruction if
+/// any.
+static MemoryLocation getLocForRead(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
+ assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+ // The only instructions that both read and write are the mem transfer
+ // instructions (memcpy/memmove).
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+ return MemoryLocation::getForSource(MTI);
+ return MemoryLocation();
+}
+
+/// If the value of this instruction and the memory it writes to is unused, may
+/// we delete this instruction?
+static bool isRemovable(Instruction *I) {
+ // Don't remove volatile/atomic stores.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ case Intrinsic::lifetime_end:
+ // Never remove dead lifetime_end's, e.g. because it is followed by a
+ // free.
+ return false;
+ case Intrinsic::init_trampoline:
+ // Always safe to remove init_trampoline.
+ return true;
+
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ // Don't remove volatile memory intrinsics.
+ return !cast<MemIntrinsic>(II)->isVolatile();
+ }
+ }
+
+ if (auto CS = CallSite(I))
+ return CS.getInstruction()->use_empty();
+
+ return false;
+}
+
+
+/// Returns true if the end of this instruction can be safely shortened in
+/// length.
+static bool isShortenableAtTheEnd(Instruction *I) {
+ // Don't shorten stores for now
+ if (isa<StoreInst>(I))
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ // Do shorten memory intrinsics.
+ // FIXME: Add memmove if it's also safe to transform.
+ return true;
+ }
+ }
+
+ // Don't shorten libcalls calls for now.
+
+ return false;
+}
+
+/// Returns true if the beginning of this instruction can be safely shortened
+/// in length.
+static bool isShortenableAtTheBeginning(Instruction *I) {
+ // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
+ // easily done by offsetting the source address.
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return II && II->getIntrinsicID() == Intrinsic::memset;
+}
+
+/// Return the pointer that is being written to.
+static Value *getStoredPointerOperand(Instruction *I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return MI->getDest();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::init_trampoline:
+ return II->getArgOperand(0);
+ }
+ }
+
+ CallSite CS(I);
+ // All the supported functions so far happen to have dest as their first
+ // argument.
+ return CS.getArgument(0);
+}
+
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ uint64_t Size;
+ if (getObjectSize(V, Size, DL, &TLI))
+ return Size;
+ return MemoryLocation::UnknownSize;
+}
+
+namespace {
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
+}
+
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
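+/// For illustration (hypothetical offsets and sizes): an earlier 8-byte write
+/// of [0, 8) followed by a later 8-byte write of [0, 8) yields OW_Complete,
+/// whereas a later 4-byte write of [4, 8) only covers the tail of the earlier
+/// write and yields OW_End.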
+static OverwriteResult isOverwrite(const MemoryLocation &Later,
+ const MemoryLocation &Earlier,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
+ int64_t &EarlierOff, int64_t &LaterOff,
+ Instruction *DepWrite,
+ InstOverlapIntervalsTy &IOL) {
+ // If we don't know the sizes of either access, then we can't do a comparison.
+ if (Later.Size == MemoryLocation::UnknownSize ||
+ Earlier.Size == MemoryLocation::UnknownSize)
+ return OW_Unknown;
+
+ const Value *P1 = Earlier.Ptr->stripPointerCasts();
+ const Value *P2 = Later.Ptr->stripPointerCasts();
+
+ // If the start pointers are the same, we just have to compare sizes to see if
+ // the later store was larger than the earlier store.
+ if (P1 == P2) {
+ // Make sure that the Later size is >= the Earlier size.
+ if (Later.Size >= Earlier.Size)
+ return OW_Complete;
+ }
+
+ // Check to see if the later store is to the entire object (either a global,
+ // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // overwrites any other store to the same object.
+ const Value *UO1 = GetUnderlyingObject(P1, DL),
+ *UO2 = GetUnderlyingObject(P2, DL);
+
+ // If we can't resolve the same pointers to the same object, then we can't
+ // analyze them at all.
+ if (UO1 != UO2)
+ return OW_Unknown;
+
+ // If the "Later" store is to a recognizable object, get its size.
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ if (ObjectSize != MemoryLocation::UnknownSize)
+ if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
+ return OW_Complete;
+
+ // Okay, we have stores to two completely different pointers. Try to
+ // decompose the pointer into a "base + constant_offset" form. If the base
+ // pointers are equal, then we can reason about the two stores.
+ EarlierOff = 0;
+ LaterOff = 0;
+ const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+ const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+ // If the base pointers still differ, we have two completely different stores.
+ if (BP1 != BP2)
+ return OW_Unknown;
+
+ // The later store completely overlaps the earlier store if:
+ //
+ // 1. Both start at the same offset and the later one's size is greater than
+ // or equal to the earlier one's, or
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // 2. The earlier store has an offset greater than the later offset, but which
+ // still lies completely within the later store.
+ //
+ // |--earlier--|
+ // |----- later ------|
+ //
+ // We have to be careful here as *Off is signed while *.Size is unsigned.
+ if (EarlierOff >= LaterOff &&
+ Later.Size >= Earlier.Size &&
+ uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
+ return OW_Complete;
+
+ // We may now overlap, although the overlap is not complete. There might also
+ // be other incomplete overlaps, and together, they might cover the complete
+ // earlier write.
+ // Note: The correctness of this logic depends on the fact that this function
+ // is not called with DepWrite when there are any intervening reads.
+ if (EnablePartialOverwriteTracking &&
+ LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= EarlierOff) {
+
+ // Insert our part of the overlap into the map.
+ auto &IM = IOL[DepWrite];
+ DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
+ LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+
+ // Make sure that we only insert non-overlapping intervals and combine
+ // adjacent intervals. The intervals are stored in the map with the ending
+ // offset as the key (in the half-open sense) and the starting offset as
+ // the value.
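+ // For example, a later write covering bytes [4, 12) of the earlier write
+ // is recorded as IM[12] = 4.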
+ int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size;
+
+ // Find any intervals ending at, or after, LaterIntStart which start
+ // before LaterIntEnd.
+ auto ILI = IM.lower_bound(LaterIntStart);
+ if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // This existing interval is overlapped with the current store somewhere
+ // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // intervals and adjusting our start and end.
+ LaterIntStart = std::min(LaterIntStart, ILI->second);
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+
+ // Continue erasing and adjusting our end in case other previous
+ // intervals are also overlapped with the current store.
+ //
+ // |--- earlier 1 ---|  |--- earlier 2 ---|
+ // |------- later---------|
+ //
+ while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ assert(ILI->second > LaterIntStart && "Unexpected interval");
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+ }
+ }
+
+ IM[LaterIntEnd] = LaterIntStart;
+
+ ILI = IM.begin();
+ if (ILI->second <= EarlierOff &&
+ ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
+ DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
+ EarlierOff << ", " <<
+ int64_t(EarlierOff + Earlier.Size) <<
+ ") Composite Later [" <<
+ ILI->second << ", " << ILI->first << ")\n");
+ ++NumCompletePartials;
+ return OW_Complete;
+ }
+ }
+
+ // Another interesting case is if the later store overwrites the end of the
+ // earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to trim the size of earlier to avoid generating
+ // writes to addresses which will definitely be overwritten later
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
+ return OW_End;
+
+ // Finally, we also need to check if the later store overwrites the beginning
+ // of the earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to move the destination address and trim the size
+ // of earlier to avoid generating writes to addresses which will definitely
+ // be overwritten later.
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
+ assert(int64_t(LaterOff + Later.Size) <
+ int64_t(EarlierOff + Earlier.Size) &&
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
+ }
+ // Otherwise, they don't completely overlap.
+ return OW_Unknown;
+}
+
+/// If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense. Consider this case:
+///
+/// memcpy(A <- B)
+/// memcpy(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+ const MemoryLocation &InstStoreLoc,
+ Instruction *DepWrite,
+ const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA) {
+ // Self reads can only happen for instructions that read memory. Get the
+ // location read.
+ MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
+ if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+
+ // If the read and written loc obviously don't alias, it isn't a read.
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
+
+ // Okay, 'Inst' may copy over itself. However, we can still remove the
+ // DepWrite instruction if we can prove that it reads from the same location
+ // as Inst. This handles useful cases like:
+ // memcpy(A <- B)
+ // memcpy(A <- B)
+ // Here we don't know if A/B may alias, but we do know that B/B are must
+ // aliases, so removing the first memcpy is safe (assuming it writes <= #
+ // bytes as the second one).
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+
+ // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+ // then it can't be considered dead.
+ return true;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
+static bool memoryIsNotModifiedBetween(Instruction *FirstI,
+ Instruction *SecondI,
+ AliasAnalysis *AA) {
+ SmallVector<BasicBlock *, 16> WorkList;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ BasicBlock::iterator FirstBBI(FirstI);
+ ++FirstBBI;
+ BasicBlock::iterator SecondBBI(SecondI);
+ BasicBlock *FirstBB = FirstI->getParent();
+ BasicBlock *SecondBB = SecondI->getParent();
+ MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+
+ // Start checking the store-block.
+ WorkList.push_back(SecondBB);
+ bool isFirstBlock = true;
+
+ // Check all blocks going backward until we reach the load-block.
+ while (!WorkList.empty()) {
+ BasicBlock *B = WorkList.pop_back_val();
+
+ // Ignore instructions before LI if this is the FirstBB.
+ BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+ BasicBlock::iterator EI;
+ if (isFirstBlock) {
+ // Ignore instructions after SI if this is the first visit of SecondBB.
+ assert(B == SecondBB && "first block is not the store block");
+ EI = SecondBBI;
+ isFirstBlock = false;
+ } else {
+ // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
+ // In this case we also have to look at instructions after SI.
+ EI = B->end();
+ }
+ for (; BI != EI; ++BI) {
+ Instruction *I = &*BI;
+ if (I->mayWriteToMemory() && I != SecondI) {
+ auto Res = AA->getModRefInfo(I, MemLoc);
+ if (Res & MRI_Mod)
+ return false;
+ }
+ }
+ if (B != FirstBB) {
+ assert(B != &FirstBB->getParent()->getEntryBlock() &&
+ "Should not hit the entry block because SI must be dominated by LI");
+ for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+ if (!Visited.insert(*PredI).second)
+ continue;
+ WorkList.push_back(*PredI);
+ }
+ }
+ }
+ return true;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+ BasicBlock *BB, DominatorTree *DT) {
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *Pred = *I;
+ if (Pred == BB) continue;
+ TerminatorInst *PredTI = Pred->getTerminator();
+ if (PredTI->getNumSuccessors() != 1)
+ continue;
+
+ if (DT->isReachableFromEntry(Pred))
+ Blocks.push_back(Pred);
+ }
+}
+
+/// Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+static bool handleFree(CallInst *F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ bool MadeChange = false;
+
+ MemoryLocation Loc = MemoryLocation(F->getOperand(0));
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
+ const DataLayout &DL = F->getModule()->getDataLayout();
+
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F;
+
+ MemDepResult Dep =
+ MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ break;
+
+ Value *DepPointer =
+ GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
+
+ // Check for aliasing.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
+
+ DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
+
+ // DCE instructions only used to calculate that store.
+ BasicBlock::iterator BBI(Dependency);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
+ }
+
+ if (Dep.isNonLocal())
+ findUnconditionalPreds(Blocks, BB, DT);
+ }
+
+ return MadeChange;
+}
+
+/// Check to see if the specified location may alias any of the stack objects in
+/// the DeadStackObjects set. If so, they become live because the location is
+/// being loaded.
+static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
+ SmallSetVector<Value *, 16> &DeadStackObjects,
+ const DataLayout &DL, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI) {
+ const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+
+ // A constant can't be in the dead pointer set.
+ if (isa<Constant>(UnderlyingPointer))
+ return;
+
+ // If the kill pointer can be easily reduced to an alloca, don't bother doing
+ // extraneous AA queries.
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+ DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
+ return;
+ }
+
+ // Remove objects that could alias LoadedLoc.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the loaded location could alias the stack location.
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ });
+}
+
+/// Remove dead stores to stack-allocated locations in the function end block.
+/// Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ bool MadeChange = false;
+
+ // Keep track of all of the stack objects that are dead at the end of the
+ // function.
+ SmallSetVector<Value*, 16> DeadStackObjects;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock &Entry = BB.getParent()->front();
+ for (Instruction &I : Entry) {
+ if (isa<AllocaInst>(&I))
+ DeadStackObjects.insert(&I);
+
+ // Okay, so these are dead heap objects, but if the pointer never escapes
+ // then it's leaked by this function anyway.
+ else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+ DeadStackObjects.insert(&I);
+ }
+
+ // Treat byval or inalloca arguments the same, stores to them are dead at the
+ // end of the function.
+ for (Argument &AI : BB.getParent()->args())
+ if (AI.hasByValOrInAllocaAttr())
+ DeadStackObjects.insert(&AI);
+
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store, check to see if it points into a dead stack value.
+ if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ // See through pointer-to-pointer bitcasts
+ SmallVector<Value *, 4> Pointers;
+ GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
+
+ // Stores to stack values are valid candidates for removal.
+ bool AllDead = true;
+ for (Value *Pointer : Pointers)
+ if (!DeadStackObjects.count(Pointer)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
+ Instruction *Dead = &*BBI;
+
+ DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ }
+ dbgs() << '\n');
+
+ // DCE instructions only used to calculate that store.
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
+ ++NumFastStores;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Remove any dead non-memory-mutating instructions.
+ if (isInstructionTriviallyDead(&*BBI, TLI)) {
+ DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
+ ++NumFastOther;
+ MadeChange = true;
+ continue;
+ }
+
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(&*BBI);
+ continue;
+ }
+
+ if (auto CS = CallSite(&*BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(&*BBI, TLI))
+ DeadStackObjects.remove(&*BBI);
+
+ // If this call does not access memory, it can't be loading any of our
+ // pointers.
+ if (AA->doesNotAccessMemory(CS))
+ continue;
+
+ // If the call might load from any of our allocas, then any store above
+ // the call is live.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the call site touches the value.
+ ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI));
+
+ return A == MRI_ModRef || A == MRI_Ref;
+ });
+
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+
+ continue;
+ }
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+ // (release/acquire/seq_cst). Fences only constrain the ordering of
+ // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(*BBI))
+ continue;
+
+ MemoryLocation LoadedLoc;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
+ LoadedLoc = MemoryLocation::get(L);
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+ LoadedLoc = MemoryLocation::get(V);
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
+ LoadedLoc = MemoryLocation::getForSource(MTI);
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
+ continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
+ }
+
+ // Remove any allocas from the DeadPointer set that are loaded, as this
+ // makes any stores above the access live.
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);
+
+ // If all of the allocas were clobbered by the access then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+ }
+
+ return MadeChange;
+}
+
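+/// Shorten EarlierWrite (a memset/memcpy) so that it no longer writes bytes
+/// that the later store overwrites anyway. For illustration (hypothetical
+/// sizes): if a 16-byte memset has its last 8 bytes completely overwritten,
+/// its length can be trimmed to 8; if the first 8 bytes are overwritten
+/// instead, the length is trimmed and the destination is advanced past the
+/// dead prefix with an in-bounds GEP. Whether the trim actually happens also
+/// depends on the alignment checks below.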
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
+ int64_t &EarlierSize, int64_t LaterOffset,
+ int64_t LaterSize, bool IsOverwriteEnd) {
+ // TODO: Base this on the target vector size so that if the earlier
+ // store was too small to get vector writes anyway, then it's likely
+ // a good idea to shorten it.
+ // Power-of-2 vector writes are probably always a bad idea to optimize,
+ // as any store/memset/memcpy is likely using vector instructions, so
+ // shortening it to a non-vector size is likely to be slower.
+ MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment();
+ if (!IsOverwriteEnd)
+ LaterOffset = int64_t(LaterOffset + LaterSize);
+
+ if (!(llvm::isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
+ return false;
+
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
+ << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
+ << ")\n");
+
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ Value *EarlierWriteLength = EarlierIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ EarlierIntrinsic->setLength(TrimmedLength);
+
+ EarlierSize = NewLength;
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (LaterOffset - EarlierOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
+ EarlierIntrinsic->setDest(NewDestGEP);
+ EarlierOffset = EarlierOffset + OffsetMoved;
+ }
+ return true;
+}
+
+static bool tryToShortenEnd(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = --IntervalMap.end();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart > EarlierStart && LaterStart < EarlierStart + EarlierSize &&
+ LaterStart + LaterSize >= EarlierStart + EarlierSize) {
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, true)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool tryToShortenBegin(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
+ int64_t &EarlierStart, int64_t &EarlierSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t LaterStart = OII->second;
+ int64_t LaterSize = OII->first - LaterStart;
+
+ if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
+ assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
+ "Should have been handled as OW_Complete");
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, false)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
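+/// Walk the partial-overwrite interval map and try to shrink earlier writes
+/// whose head or tail has been completely covered by later stores: the end of
+/// a memset/memcpy can be trimmed, and the beginning of a memset can be
+/// trimmed (see isShortenableAtTheEnd/isShortenableAtTheBeginning).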
+static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
+ const DataLayout &DL,
+ InstOverlapIntervalsTy &IOL) {
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *EarlierWrite = OI.first;
+ MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
+ assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
+ assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t EarlierStart = 0;
+ int64_t EarlierSize = int64_t(Loc.Size);
+ GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |=
+ tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |=
+ tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ }
+ return Changed;
+}
+
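+/// Remove a store that is a no-op: either it stores a value straight back to
+/// the pointer it was just loaded from, or it stores zero into memory that a
+/// calloc-like allocation has already zeroed, with no intervening
+/// modification of that memory. For illustration (hypothetical IR):
+///   %v = load i32, i32* %p
+///   ; ... nothing here may write to %p ...
+///   store i32 %v, i32* %p   ; removable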
+static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
+ AliasAnalysis *AA, MemoryDependenceResults *MD,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ DenseMap<Instruction*, size_t> *InstrOrdering) {
+ // Must be a store instruction.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ if (!SI)
+ return false;
+
+ // If we're storing the same value back to a pointer that we just loaded from,
+ // then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
+
+ DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+ if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
+ Instruction *UnderlyingPointer =
+ dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+ memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
+ DEBUG(
+ dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ // FIXME: Maybe change this to use some abstraction like OrderedBasicBlock?
+ // The current OrderedBasicBlock can't deal with mutation at the moment.
+ size_t LastThrowingInstIndex = 0;
+ DenseMap<Instruction*, size_t> InstrOrdering;
+ size_t InstrIndex = 1;
+
+ // A map of interval maps representing partially-overwritten value parts.
+ InstOverlapIntervalsTy IOL;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(&*BBI, TLI)) {
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, &InstrOrdering);
+ // Increment BBI after handleFree has potentially deleted instructions.
+ // This ensures we maintain a valid iterator.
+ ++BBI;
+ continue;
+ }
+
+ Instruction *Inst = &*BBI++;
+
+ size_t CurInstNumber = InstrIndex++;
+ InstrOrdering.insert(std::make_pair(Inst, CurInstNumber));
+ if (Inst->mayThrow()) {
+ LastThrowingInstIndex = CurInstNumber;
+ continue;
+ }
+
+ // Check to see if Inst writes to memory. If not, continue.
+ if (!hasMemoryWrite(Inst, *TLI))
+ continue;
+
+ // eliminateNoopStore will update the iterator, if necessary.
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
+ MemoryLocation Loc = getLocForWrite(Inst, *AA);
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ // Loop until we find a store we can eliminate or a load that
+ // invalidates the analysis. Without an upper bound on the number of
+ // instructions examined, this analysis can become very time-consuming.
+ // However, the potential gain diminishes as we process more instructions
+ // without eliminating any of them. Therefore, we limit the number of
+ // instructions we look at.
+ auto Limit = MD->getDefaultBlockScanLimit();
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ // If we didn't get a useful location, or if it isn't a size, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // Make sure we don't look past a call which might throw. This is an
+ // issue because MemoryDependenceAnalysis works in the wrong direction:
+ // it finds instructions which dominate the current instruction, rather than
+ // instructions which are post-dominated by the current instruction.
+ //
+ // If the underlying object is a non-escaping memory allocation, any store
+ // to it is dead along the unwind edge. Otherwise, we need to preserve
+ // the store.
+ size_t DepIndex = InstrOrdering.lookup(DepWrite);
+ assert(DepIndex && "Unexpected instruction");
+ if (DepIndex <= LastThrowingInstIndex) {
+ const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
+ bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+ if (!IsStoreDeadOnUnwind) {
+ // We're looking for a call to an allocation function
+ // where the allocation doesn't escape before the last
+ // throwing instruction; PointerMayBeCaptured is a
+ // reasonably fast approximation.
+ IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+ !PointerMayBeCaptured(Underlying, false, true);
+ }
+ if (!IsStoreDeadOnUnwind)
+ break;
+ }
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) which we know that
+ // 'Inst' doesn't load from, then we can remove it.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
+ OverwriteResult OR =
+ isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
+ DepWrite, IOL);
+ if (OR == OW_Complete) {
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
+ << *DepWrite << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // We erased DepWrite; start over.
+ InstDep = MD->getDependency(Inst);
+ continue;
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
+ isShortenableAtTheBeginning(DepWrite)))) {
+ assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
+ "when partial-overwrite "
+ "tracking is enabled");
+ int64_t EarlierSize = DepLoc.Size;
+ int64_t LaterSize = Loc.Size;
+ bool IsOverwriteEnd = (OR == OW_End);
+ MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+ InstWriteOffset, LaterSize, IsOverwriteEnd);
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+ DepWrite->getIterator(), &BB,
+ /*QueryInst=*/ nullptr, &Limit);
+ }
+ }
+
+ if (EnablePartialOverwriteTracking)
+ MadeChange |= removePartiallyOverlappedStores(AA, DL, IOL);
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, &InstrOrdering);
+
+ return MadeChange;
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ // Only check non-dead blocks. Dead blocks may have strange pointer
+ // cycles that will confuse alias analysis.
+ if (DT->isReachableFromEntry(&BB))
+ MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ AliasAnalysis *AA = &AM.getResult<AAManager>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
+ const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!eliminateDeadStores(F, AA, MD, DT, TLI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+namespace {
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryDependenceResults *MD =
+ &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return eliminateDeadStores(F, AA, MD, DT, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+};
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
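+// The legacy pass is registered under the command-line name "dse" above, so
+// the transform can typically be exercised in isolation with something like
+// "opt -dse -S input.ll" (or, if the new pass manager exposes it under the
+// same name, "opt -passes=dse -S input.ll").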
+FunctionPass *llvm::createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
new file mode 100644
index 000000000000..7fd77a082b82
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -0,0 +1,1087 @@
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <deque>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
+STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
+STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumDSE, "Number of trivial dead stores removed");
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+ Instruction *Inst;
+
+ SimpleValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // This can only handle non-void readnone functions.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst))
+ return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+ return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+ isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+ isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+ isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+ }
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<SimpleValue> {
+ static inline SimpleValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline SimpleValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(SimpleValue Val);
+ static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+}
+
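+// The hash must be consistent with isEqual below, which treats commutative
+// binary operators (and compares with swapped predicates) as equal under
+// operand commutation. To preserve that property, commutable operands are
+// put into a canonical order (by pointer value) before hashing, so that,
+// e.g., 'add %a, %b' and 'add %b, %a' hash to the same value.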
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash in all of the operands as pointers.
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+ std::swap(LHS, RHS);
+
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (Inst->getOperand(0) > Inst->getOperand(1)) {
+ std::swap(LHS, RHS);
+ Pred = CI->getSwappedPredicate();
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst))
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst)) &&
+ "Invalid/unknown instruction");
+
+ // Mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalToWhenDefined(RHSI))
+ return true;
+
+ // If we're not strictly identical, we still might be a commutable instruction
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+ Instruction *Inst;
+
+ CallValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // Don't value number anything that returns void.
+ if (Inst->getType()->isVoidTy())
+ return false;
+
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI || !CI->onlyReadsMemory())
+ return false;
+ return true;
+ }
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<CallValue> {
+ static inline CallValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline CallValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(CallValue Val);
+ static bool isEqual(CallValue LHS, CallValue RHS);
+};
+}
+
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash all of the operands as pointers and mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+ return LHSI->isIdenticalTo(RHSI);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ const SimplifyQuery SQ;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+ typedef RecyclingAllocator<
+ BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
+ typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy> ScopedHTType;
+
+ /// \brief A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// A scoped hash table of the current values of previously encountered memory
+ /// locations.
+ ///
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+ /// present the table; it is the responsibility of the consumer to inspect
+ /// present in the table; it is the responsibility of the consumer to inspect
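+ /// For example, two unordered loads of the same pointer with no possibly
+ /// writing instruction between them observe the same generation, so the
+ /// second load can simply reuse the value of the first.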
+ struct LoadValue {
+ Instruction *DefInst;
+ unsigned Generation;
+ int MatchingId;
+ bool IsAtomic;
+ bool IsInvariant;
+ LoadValue()
+ : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false),
+ IsInvariant(false) {}
+ LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic, bool IsInvariant)
+ : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
+ };
+ typedef RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>
+ LoadMapAllocator;
+ typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator> LoadHTType;
+ LoadHTType AvailableLoads;
+
+ /// \brief A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>
+ CallHTType;
+ CallHTType AvailableCalls;
+
+ /// \brief This is the current generation of the memory value.
+ unsigned CurrentGeneration;
+
+ /// \brief Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+ }
+
+ bool run();
+
+private:
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ CallScope(AvailableCalls) {}
+
+ private:
+ NodeScope(const NodeScope &) = delete;
+ void operator=(const NodeScope &) = delete;
+
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // Contains all the needed information to create a stack for doing a depth
+ // first traversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+ // children do not need to be stored separately.
+ class StackNode {
+ public:
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+ DomTreeNode::iterator child, DomTreeNode::iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+ Processed(false) {}
+
+ // Accessors.
+ unsigned currentGeneration() { return CurrentGeneration; }
+ unsigned childGeneration() { return ChildGeneration; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
+ DomTreeNode::iterator childIter() { return ChildIter; }
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+ DomTreeNode::iterator end() { return EndIter; }
+ bool isProcessed() { return Processed; }
+ void process() { Processed = true; }
+
+ private:
+ StackNode(const StackNode &) = delete;
+ void operator=(const StackNode &) = delete;
+
+ // Members.
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::iterator ChildIter;
+ DomTreeNode::iterator EndIter;
+ NodeScope Scopes;
+ bool Processed;
+ };
+
+ /// \brief Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : IsTargetMemInst(false), Inst(Inst) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ if (TTI.getTgtMemIntrinsic(II, Info))
+ IsTargetMemInst = true;
+ }
+ bool isLoad() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return isa<LoadInst>(Inst);
+ }
+ bool isStore() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return isa<StoreInst>(Inst);
+ }
+ bool isAtomic() const {
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
+ return Inst->isAtomic();
+ }
+ bool isUnordered() const {
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
+ bool isInvariantLoad() const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ return false;
+ }
+
+ bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
+ return (getPointerOperand() == Inst.getPointerOperand() &&
+ getMatchingId() == Inst.getMatchingId());
+ }
+ bool isValid() const { return getPointerOperand() != nullptr; }
+
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int getMatchingId() const {
+ if (IsTargetMemInst) return Info.MatchingId;
+ return -1;
+ }
+ Value *getPointerOperand() const {
+ if (IsTargetMemInst) return Info.PtrVal;
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ return nullptr;
+ }
+ bool mayReadFromMemory() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return Inst->mayReadFromMemory();
+ }
+ bool mayWriteToMemory() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
+ bool IsTargetMemInst;
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
+ };
+
+ bool processNode(DomTreeNode *Node);
+
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+ ExpectedType);
+ }
+
+ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+ Instruction *EarlierInst, Instruction *LaterInst);
+
+ void removeMSSA(Instruction *Inst) {
+ if (!MSSA)
+ return;
+ // Removing a store here can leave MemorySSA in an unoptimized state by
+ // creating MemoryPhis that have identical arguments and by creating
+ // MemoryUses whose defining access is not an actual clobber. We handle the
+ // phi case eagerly here. The non-optimized MemoryUse case is lazily
+ // updated by MemorySSA getClobberingMemoryAccess.
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
+ // Optimize MemoryPhi nodes that may become redundant by having all the
+ // same input values once MA is removed.
+ SmallSetVector<MemoryPhi *, 4> PhisToCheck;
+ SmallVector<MemoryAccess *, 8> WorkQueue;
+ WorkQueue.push_back(MA);
+ // Process MemoryPhi nodes in FIFO order using an ever-growing vector since
+ // we shouldn't be processing that many phis and this will avoid an
+ // allocation in almost all cases.
+ for (unsigned I = 0; I < WorkQueue.size(); ++I) {
+ MemoryAccess *WI = WorkQueue[I];
+
+ for (auto *U : WI->users())
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
+ PhisToCheck.insert(MP);
+
+ MSSAUpdater->removeMemoryAccess(WI);
+
+ for (MemoryPhi *MP : PhisToCheck) {
+ MemoryAccess *FirstIn = MP->getIncomingValue(0);
+ if (all_of(MP->incoming_values(),
+ [=](Use &In) { return In == FirstIn; }))
+ WorkQueue.push_back(MP);
+ }
+ PhisToCheck.clear();
+ }
+ }
+ }
+};
+}
+
+/// Determine if the memory referenced by LaterInst is from the same heap
+/// version as EarlierInst.
+/// This is currently called in two scenarios:
+///
+/// load p
+/// ...
+/// load p
+///
+/// and
+///
+/// x = load p
+/// ...
+/// store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+ unsigned LaterGeneration,
+ Instruction *EarlierInst,
+ Instruction *LaterInst) {
+ // Check the simple memory generation tracking first.
+ if (EarlierGeneration == LaterGeneration)
+ return true;
+
+ if (!MSSA)
+ return false;
+
+ // Since we know LaterDef dominates LaterInst and EarlierInst dominates
+ // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
+ // EarlierInst and LaterInst and neither can any other write that potentially
+ // clobbers LaterInst.
+ MemoryAccess *LaterDef =
+ MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
+ return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+}
+
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+ bool Changed = false;
+ BasicBlock *BB = Node->getBlock();
+
+ // If this block has a single predecessor, then the predecessor is the parent
+ // of the domtree node and all of the live out memory values are still current
+ // in this block. If this block has multiple predecessors, then they could
+ // have invalidated the live-out memory values of our parent value. For now,
+ // just be conservative and invalidate memory if this block has multiple
+ // predecessors.
+ if (!BB->getSinglePredecessor())
+ ++CurrentGeneration;
+
+ // If this node has a single predecessor which ends in a conditional branch,
+ // we can infer the value of the branch condition given that we took this
+ // path. We need the single predecessor to ensure there's not another path
+ // which reaches this block where the condition might hold a different
+ // value. Since we're adding this to the scoped hash table (like any other
+ // def), it will have been popped if we encounter a future merge block.
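+ // For example, if the predecessor ends in 'br i1 %c, label %bb, label %other'
+ // and %bb (this block) is only reached via the true edge, then %c is known
+ // to be true here and dominated uses of %c can be folded to 'true'.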
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP += Count;
+ }
+ }
+ }
+ }
+
+  // LastStore - Keep track of the last non-volatile store that we saw... for
+  // as long as there is no instruction that reads memory.  If we see a store
+  // to the same location, we delete the dead store.  This zaps trivial dead
+  // stores which can occur in bitfield code among other things.
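+  //
+  // Illustrative sketch (not taken from the original source):
+  //   store i32 %a, i32* %p
+  //   store i32 %b, i32* %p   ; with no intervening read, the first store is
+  //                           ; dead and is deleted below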
+ Instruction *LastStore = nullptr;
+
+ // See if any instructions in the block can be eliminated. If so, do it. If
+ // not, add them to AvailableValues.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ // Dead instructions should just be removed.
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+    // Skip assume intrinsics; they don't really have side effects (although
+    // they're marked as such to ensure preservation of control dependencies),
+    // and this pass will not bother removing them. However, we should mark
+    // their conditions as true for all dominated blocks.
+ if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ continue;
+ }
+
+    // Skip invariant.start intrinsics since they only read memory, and we can
+    // forward values across them. Also, we don't need to consume the last
+    // store since the semantics of invariant.start allow us to perform DSE of
+    // the last store, if there was a store following invariant.start.
+    // Consider:
+    //
+    //   store 30, i8* p
+    //   invariant.start(p)
+    //   store 40, i8* p
+    //
+    // We can DSE the store of 30, since the store of 40 to the invariant
+    // location p causes undefined behaviour.
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ continue;
+
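+    // An @llvm.experimental.guard(i1 %cond, ...) call deoptimizes when %cond
+    // is false, so on the fall-through path the condition is known to hold.
+    // That is what allows us both to drop guards whose condition is already
+    // known to be true and to treat the condition as true throughout the
+    // dominated region.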
+ if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
+ if (auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst)->setArgOperand(0, KnownCond);
+ }
+          // The condition we're guarding on here is true for all dominated
+          // locations.
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
+ }
+
+ // Guard intrinsics read all memory, but don't write any memory.
+ // Accordingly, don't update the generation but consume the last store (to
+ // avoid an incorrect DSE).
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+ // its simpler value.
+ if (Value *V = SimplifyInstruction(Inst, SQ)) {
+ DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
+ bool Killed = false;
+ if (!Inst->use_empty()) {
+ Inst->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
+ }
+
+ // If this is a simple instruction that we can value number, process it.
+ if (SimpleValue::canHandle(Inst)) {
+ // See if the instruction has an available value. If so, use it.
+ if (Value *V = AvailableValues.lookup(Inst)) {
+ DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ if (auto *I = dyn_cast<Instruction>(V))
+ I->andIRFlags(Inst);
+ Inst->replaceAllUsesWith(V);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSE;
+ continue;
+ }
+
+ // Otherwise, just remember that this value is available.
+ AvailableValues.insert(Inst, Inst);
+ continue;
+ }
+
+ ParseMemoryInst MemInst(Inst, TTI);
+ // If this is a non-volatile load, process it.
+ if (MemInst.isValid() && MemInst.isLoad()) {
+      // (Conservatively) we can't peek past the ordering implied by this
+      // operation, but we can add this load to our set of available values.
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+ LastStore = nullptr;
+ ++CurrentGeneration;
+ }
+
+ // If we have an available version of this load, and if it is the right
+ // generation or the load is known to be from an invariant location,
+ // replace this instruction.
+ //
+ // If either the dominating load or the current load are invariant, then
+ // we can assume the current load loads the same value as the dominating
+ // load.
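+      // Illustrative sketch (not taken from the original source):
+      //   %v1 = load i32, i32* %p
+      //   ...                        ; no clobber of %p (same generation)
+      //   %v2 = load i32, i32* %p    ; replaced by %v1 below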
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst != nullptr &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing loads with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ // We can't replace an atomic load with one which isn't also atomic.
+ InVal.IsAtomic >= MemInst.isAtomic() &&
+ (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
+ Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
+ if (Op != nullptr) {
+ DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.DefInst << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(Op);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
+ }
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic(), MemInst.isInvariantLoad()));
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+ LastStore = nullptr;
+
+ // If this is a read-only call, process it.
+ if (CallValue::canHandle(Inst)) {
+ // If we have an available version of this call, and if it is the right
+ // generation, replace this instruction.
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
+ if (InVal.first != nullptr &&
+ isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+ Inst)) {
+ DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(InVal.first);
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSECall;
+ continue;
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableCalls.insert(
+ Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
+ continue;
+ }
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence. As a
+ // result, we don't need to consider it as writing to memory and don't need
+ // to advance the generation. We do need to prevent DSE across the fence,
+ // but that's handled above.
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (FI->getOrdering() == AtomicOrdering::Release) {
+ assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+    // Write-back DSE - If we write back the same value we just loaded from
+    // the same location and haven't passed any intervening writes or ordering
+    // operations, we can remove the write.  The primary benefit is in allowing
+    // the available load table to remain valid and value forward past where
+    // the store originally was.
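+    //
+    // Illustrative sketch (not taken from the original source):
+    //   %v = load i32, i32* %p
+    //   ...                      ; no intervening writes or ordering ops
+    //   store i32 %v, i32* %p    ; removable: it only writes back %v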
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst &&
+ InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing stores with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst)) {
+ // It is okay to have a LastStore to a different pointer here if MemorySSA
+ // tells us that the load and store are from the same memory generation.
+ // In that case, LastStore should keep its present value since we're
+ // removing the current store.
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand() ||
+ MSSA) &&
+ "can't have an intervening store if not using MemorySSA!");
+ DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
+ // Okay, this isn't something we can CSE at all. Check to see if it is
+ // something that could modify memory. If so, our available memory values
+ // cannot be used so bump the generation count.
+ if (Inst->mayWriteToMemory()) {
+ ++CurrentGeneration;
+
+ if (MemInst.isValid() && MemInst.isStore()) {
+ // We do a trivial form of DSE if there are two stores to the same
+ // location with no intervening loads. Delete the earlier store.
+ // At the moment, we don't remove ordered stores, but do remove
+ // unordered atomic stores. There's no special requirement (for
+ // unordered atomics) about removing atomic stores only in favor of
+        // other atomic stores since we're going to execute the non-atomic
+ // one anyway and the atomic one might never have become visible.
+ if (LastStore) {
+ ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ assert(LastStoreMemInst.isUnordered() &&
+ !LastStoreMemInst.isVolatile() &&
+ "Violated invariant");
+ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ removeMSSA(LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
+ // fallthrough - we can exploit information about this store
+ }
+
+ // Okay, we just invalidated anything we knew about loaded values. Try
+ // to salvage *something* by remembering that the stored value is a live
+ // version of the pointer. It is safe to forward from volatile stores
+ // to non-volatile loads, so we don't have to check for volatility of
+ // the store.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic(), /*IsInvariant=*/false));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed. We could insert a fence, but
+ // since fences are slightly stronger than stores in their ordering,
+ // it's not clear this is a profitable transform. Another option would
+ // be to merge the ordering with that of the post dominating store.
+ if (MemInst.isUnordered() && !MemInst.isVolatile())
+ LastStore = Inst;
+ else
+ LastStore = nullptr;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool EarlyCSE::run() {
+  // Note: a deque is used here because there are significant performance
+  // gains over vector when the container becomes very large, due to the
+  // specific access patterns. For more information see the mailing list
+  // discussion on this:
+ // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
+
+ bool Changed = false;
+
+ // Process the root node.
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+ DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+ // Save the current generation.
+ unsigned LiveOutGeneration = CurrentGeneration;
+
+ // Process the stack.
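+  // This is an iterative depth-first traversal of the dominator tree using an
+  // explicit stack of StackNode objects, which avoids deep recursion on large
+  // functions: a node is processed on its first visit, its children are then
+  // pushed one at a time, and the node is popped once all of its children
+  // have been handled.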
+ while (!nodesToProcess.empty()) {
+ // Grab the first item off the stack. Set the current generation, remove
+ // the node from the stack, and process it.
+ StackNode *NodeToProcess = nodesToProcess.back();
+
+ // Initialize class members.
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ // Check if the node needs to be processed.
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node.
+ Changed |= processNode(NodeToProcess->node());
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *child = NodeToProcess->nextChild();
+ nodesToProcess.push_back(
+ new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+ NodeToProcess->childGeneration(), child, child->begin(),
+ child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ delete NodeToProcess;
+ nodesToProcess.pop_back();
+ }
+ } // while (!nodes...)
+
+ // Reset the current generation.
+ CurrentGeneration = LiveOutGeneration;
+
+ return Changed;
+}
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSA =
+ UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ if (!CSE.run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+template<bool UseMemorySSA>
+class EarlyCSELegacyCommonPass : public FunctionPass {
+public:
+ static char ID;
+
+ EarlyCSELegacyCommonPass() : FunctionPass(ID) {
+ if (UseMemorySSA)
+ initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
+ else
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA =
+ UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (UseMemorySSA) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
+
+template<>
+char EarlyCSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
+
+using EarlyCSEMemSSALegacyPass =
+ EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
+
+template<>
+char EarlyCSEMemSSALegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
+ if (UseMemorySSA)
+ return new EarlyCSEMemSSALegacyPass();
+ else
+ return new EarlyCSELegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
new file mode 100644
index 000000000000..063df779a30b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -0,0 +1,80 @@
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+public:
+ FlattenCFGPass() : FunctionPass(ID) {
+ initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+    // Loop over all of the basic blocks and try to flatten each one.
+ //
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
+ if (FlattenCFG(&*BBIt++, AA)) {
+ LocalChange = true;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ bool EverChanged = false;
+ // iterativelyFlattenCFG can make some blocks dead.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
new file mode 100644
index 000000000000..b105ece8dc7c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -0,0 +1,525 @@
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "float2int"
+
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence datastructure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
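+//
+// Illustrative sketch (not taken from the original source): given
+//   %f = sitofp i32 %i to double
+//   %g = fadd double %f, 1.0
+//   %r = fptosi double %g to i32
+// the whole graph can be rewritten to operate on integers, roughly as
+//   %r = add i32 %i, 1
+// provided the computed ranges prove the arithmetic is exact in the integer
+// domain.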
+
+/// The largest integer type worth dealing with.
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+             cl::desc("Max integer bitwidth to consider in float2int "
+                      "(default=64)"));
+
+namespace {
+ struct Float2IntLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Float2IntLegacyPass() : FunctionPass(ID) {
+ initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return Impl.runImpl(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ private:
+ Float2IntPass Impl;
+ };
+}
+
+char Float2IntLegacyPass::ID = 0;
+INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE.
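+// Both the ordered and unordered form of each comparison map to the same
+// integer predicate; in a transformable graph the FP values are derived from
+// integers or finite constants, so NaNs cannot occur and the ordered versus
+// unordered distinction does not matter.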
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ return CmpInst::ICMP_EQ;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ return CmpInst::ICMP_SGT;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ return CmpInst::ICMP_SGE;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ return CmpInst::ICMP_SLT;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ return CmpInst::ICMP_SLE;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
+ return CmpInst::ICMP_NE;
+ default:
+ return CmpInst::BAD_ICMP_PREDICATE;
+ }
+}
+
+// Given a floating point binary operator, return the matching
+// integer version.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case Instruction::FAdd: return Instruction::Add;
+ case Instruction::FSub: return Instruction::Sub;
+ case Instruction::FMul: return Instruction::Mul;
+ }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain.
+void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
+ for (auto &I : instructions(F)) {
+ if (isa<VectorType>(I.getType()))
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Roots.insert(&I);
+ break;
+ case Instruction::FCmp:
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
+ }
+}
+
+// Helper - mark I as having been traversed, having range R.
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
+ DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
+ else
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
+}
+
+// Helper - get a range representing a poison value.
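+// badRange() is the full set and unknownRange() the empty set, both at
+// MaxIntegerBW+1 bits: ConstantRange(BW, /*isFullSet=*/true) constructs the
+// full range and ConstantRange(BW, /*isFullSet=*/false) the empty range.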
+ConstantRange Float2IntPass::badRange() {
+ return ConstantRange(MaxIntegerBW + 1, true);
+}
+ConstantRange Float2IntPass::unknownRange() {
+ return ConstantRange(MaxIntegerBW + 1, false);
+}
+ConstantRange Float2IntPass::validateRange(ConstantRange R) {
+ if (R.getBitWidth() > MaxIntegerBW + 1)
+ return badRange();
+ return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that requires direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+//   - walkBackwards:  A breadth-first walk of the use-def graph starting from
+//                     the roots. Populate "SeenInsts" with interesting
+//                     instructions and poison values if they're obvious and
+//                     cheap to compute. Calculate the equivalence set structure
+//                     while we're here too.
+//   - walkForwards:   Iterate over SeenInsts in reverse order, so we visit
+//                     defs before their uses. Calculate the real range info.
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+ std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ if (SeenInsts.find(I) != SeenInsts.end())
+ // Seen already.
+ continue;
+
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ // Path terminated uncleanly.
+ seen(I, badRange());
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP: {
+ // Path terminated cleanly - use the type of the integer input to seed
+ // the analysis.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ auto Input = ConstantRange(BW, true);
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
+ continue;
+ }
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FCmp:
+ seen(I, unknownRange());
+ break;
+ }
+
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ // Unify def-use chains if they interfere.
+ ECs.unionSets(I, OI);
+ if (SeenInsts.find(I)->second != badRange())
+ Worklist.push_back(OI);
+ } else if (!isa<ConstantFP>(O)) {
+ // Not an instruction or ConstantFP? we can't do anything.
+ seen(I, badRange());
+ }
+ }
+ }
+}
+
+// Walk forwards down the list of seen instructions, so we visit defs before
+// uses.
+void Float2IntPass::walkForwards() {
+ for (auto &It : reverse(SeenInsts)) {
+ if (It.second != unknownRange())
+ continue;
+
+ Instruction *I = It.first;
+ std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+      llvm_unreachable("Should have been handled in walkBackwards!");
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "it's a binary operator!");
+ auto BinOp = (Instruction::BinaryOps) I->getOpcode();
+ return Ops[0].binaryOp(BinOp, Ops[1]);
+ };
+ break;
+
+ //
+ // Root-only instructions - we'll only see these if they're the
+ // first node in a walk.
+ //
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+        // Note: We're ignoring the cast's output size here as that's what the
+        // caller expects.
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ return Ops[0].castOp(CastOp, MaxIntegerBW+1);
+ };
+ break;
+
+ case Instruction::FCmp:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FCmp is a binary operator!");
+ return Ops[0].unionWith(Ops[1]);
+ };
+ break;
+ }
+
+ bool Abort = false;
+ SmallVector<ConstantRange,4> OpRanges;
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ assert(SeenInsts.find(OI) != SeenInsts.end() &&
+ "def not seen before use!");
+ OpRanges.push_back(SeenInsts.find(OI)->second);
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+ // Work out if the floating point number can be losslessly represented
+ // as an integer.
+ // APFloat::convertToInteger(&Exact) purports to do what we want, but
+ // the exactness can be too precise. For example, negative zero can
+ // never be exactly converted to an integer.
+ //
+ // Instead, we ask APFloat to round itself to an integral value - this
+ // preserves sign-of-zero - then compare the result with the original.
+ //
+ const APFloat &F = CF->getValueAPF();
+
+ // First, weed out obviously incorrect values. Non-finite numbers
+ // can't be represented and neither can negative zero, unless
+ // we're in fast math mode.
+ if (!F.isFinite() ||
+ (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+ !I->hasNoSignedZeros())) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+
+ APFloat NewF = F;
+ auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+ // OK, it's representable. Now get it.
+ APSInt Int(MaxIntegerBW+1, false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Int,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ OpRanges.push_back(ConstantRange(Int));
+ } else {
+ llvm_unreachable("Should have already marked this as badRange!");
+ }
+ }
+
+ // Reduce the operands' ranges to a single range and return.
+ if (!Abort)
+ seen(I, Op(OpRanges));
+ }
+}
+
+// If there is a valid transform to be done, do it.
+bool Float2IntPass::validateAndTransform() {
+ bool MadeChange = false;
+
+ // Iterate over every disjoint partition of the def-use graph.
+ for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+ ConstantRange R(MaxIntegerBW + 1, false);
+ bool Fail = false;
+ Type *ConvertedToTy = nullptr;
+
+ // For every member of the partition, union all the ranges together.
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI) {
+ Instruction *I = *MI;
+ auto SeenI = SeenInsts.find(I);
+ if (SeenI == SeenInsts.end())
+ continue;
+
+ R = R.unionWith(SeenI->second);
+ // We need to ensure I has no users that have not been seen.
+ // If it does, transformation would be illegal.
+ //
+ // Don't count the roots, as they terminate the graphs.
+ if (Roots.count(I) == 0) {
+ // Set the type of the conversion while we're here.
+ if (!ConvertedToTy)
+ ConvertedToTy = I->getType();
+ for (User *U : I->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+ DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ Fail = true;
+ break;
+ }
+ }
+ }
+ if (Fail)
+ break;
+ }
+
+ // If the set was empty, or we failed, or the range is poisonous,
+ // bail out.
+ if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+ R.isFullSet() || R.isSignWrappedSet())
+ continue;
+ assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+ // The number of bits required is the maximum of the upper and
+ // lower limits, plus one so it can be signed.
+ unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+ R.getUpper().getMinSignedBits()) + 1;
+ DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+    // If we've run outside the realm of exactly representable integers, the
+    // floating point result will differ from an integer approximation.
+
+ // Do we need more bits than are in the mantissa of the type we converted
+ // to? semanticsPrecision returns the number of mantissa bits plus one
+ // for the sign bit.
+ unsigned MaxRepresentableBits
+ = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+ if (MinBW > MaxRepresentableBits) {
+ DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ continue;
+ }
+ if (MinBW > 64) {
+ DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ continue;
+ }
+
+ // OK, R is known to be representable. Now pick a type for it.
+ // FIXME: Pick the smallest legal type that will fit.
+ Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI)
+ convert(*MI, Ty);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
+ if (ConvertedInsts.find(I) != ConvertedInsts.end())
+ // Already converted this instruction.
+ return ConvertedInsts[I];
+
+ SmallVector<Value*,4> NewOperands;
+ for (Value *V : I->operands()) {
+ // Don't recurse if we're an instruction that terminates the path.
+ if (I->getOpcode() == Instruction::UIToFP ||
+ I->getOpcode() == Instruction::SIToFP) {
+ NewOperands.push_back(V);
+ } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ NewOperands.push_back(convert(VI, ToTy));
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ APSInt Val(ToTy->getPrimitiveSizeInBits(), /*IsUnsigned=*/false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Val,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ NewOperands.push_back(ConstantInt::get(ToTy, Val));
+ } else {
+ llvm_unreachable("Unhandled operand type?");
+ }
+ }
+
+ // Now create a new instruction.
+ IRBuilder<> IRB(I);
+ Value *NewV = nullptr;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unhandled instruction!");
+
+ case Instruction::FPToUI:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FPToSI:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FCmp: {
+ CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+ assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+ NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+ break;
+ }
+
+ case Instruction::UIToFP:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::SIToFP:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+ NewOperands[0], NewOperands[1],
+ I->getName());
+ break;
+ }
+
+ // If we're a root instruction, RAUW.
+ if (Roots.count(I))
+ I->replaceAllUsesWith(NewV);
+
+ ConvertedInsts[I] = NewV;
+ return NewV;
+}
+
+// Perform dead code elimination on the instructions we just modified.
+void Float2IntPass::cleanup() {
+ for (auto &I : reverse(ConvertedInsts))
+ I.first->eraseFromParent();
+}
+
+bool Float2IntPass::runImpl(Function &F) {
+ DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ // Clear out all state.
+ ECs = EquivalenceClasses<Instruction*>();
+ SeenInsts.clear();
+ ConvertedInsts.clear();
+ Roots.clear();
+
+ Ctx = &F.getParent()->getContext();
+
+ findRoots(F, Roots);
+
+ walkBackwards(Roots);
+ walkForwards();
+
+ bool Modified = validateAndTransform();
+ if (Modified)
+ cleanup();
+ return Modified;
+}
+
+namespace llvm {
+FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
+
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
+ if (!runImpl(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // End namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 000000000000..0fe72f3f7331
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,2375 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
+#include <vector>
+using namespace llvm;
+using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "gvn"
+
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+ cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+// Maximum allowed recursion depth.
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+ cl::desc("Max recurse depth (default = 1000)"));
+
+struct llvm::GVN::Expression {
+ uint32_t opcode;
+ Type *type;
+ SmallVector<uint32_t, 4> varargs;
+
+ Expression(uint32_t o = ~2U) : opcode(o) {}
+
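+  // Opcodes ~0U and ~1U are reserved for the DenseMap empty and tombstone
+  // keys (see DenseMapInfo<GVN::Expression> below), so once the opcodes
+  // match, expressions carrying one of those sentinel opcodes compare equal
+  // without looking at the remaining fields.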
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ if (opcode == ~0U || opcode == ~1U)
+ return true;
+ if (type != other.type)
+ return false;
+ if (varargs != other.varargs)
+ return false;
+ return true;
+ }
+
+ friend hash_code hash_value(const Expression &Value) {
+ return hash_combine(
+ Value.opcode, Value.type,
+ hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+ }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<GVN::Expression> {
+ static inline GVN::Expression getEmptyKey() { return ~0U; }
+
+ static inline GVN::Expression getTombstoneKey() { return ~1U; }
+
+ static unsigned getHashValue(const GVN::Expression &e) {
+ using llvm::hash_value;
+ return static_cast<unsigned>(hash_value(e));
+ }
+ static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
+ return LHS == RHS;
+ }
+};
+} // End llvm namespace.
+
+/// Represents a particular available value that we know how to materialize.
+/// Materialization of an AvailableValue never fails. An AvailableValue is
+/// implicitly associated with a rematerialization point which is the
+/// location of the instruction from which it was formed.
+struct llvm::gvn::AvailableValue {
+ enum ValType {
+ SimpleVal, // A simple offsetted value that is accessed.
+ LoadVal, // A value produced by a load.
+ MemIntrin, // A memory intrinsic which is loaded from.
+    UndefVal   // A UndefValue representing a value from a dead block (which
+               // is not yet physically removed from the CFG).
+ };
+
+ /// V - The value that is live out of the block.
+ PointerIntPair<Value *, 2, ValType> Val;
+
+ /// Offset - The byte offset in Val that is interesting for the load query.
+ unsigned Offset;
+
+ static AvailableValue get(Value *V, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(V);
+ Res.Val.setInt(SimpleVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(MI);
+ Res.Val.setInt(MemIntrin);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(LI);
+ Res.Val.setInt(LoadVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getUndef() {
+ AvailableValue Res;
+ Res.Val.setPointer(nullptr);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
+ bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+ bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+ bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+ Value *getSimpleValue() const {
+ assert(isSimpleValue() && "Wrong accessor");
+ return Val.getPointer();
+ }
+
+ LoadInst *getCoercedLoadValue() const {
+ assert(isCoercedLoadValue() && "Wrong accessor");
+ return cast<LoadInst>(Val.getPointer());
+ }
+
+ MemIntrinsic *getMemIntrinValue() const {
+ assert(isMemIntrinValue() && "Wrong accessor");
+ return cast<MemIntrinsic>(Val.getPointer());
+ }
+
+ /// Emit code at the specified insertion point to adjust the value defined
+ /// here to the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
+ GVN &gvn) const;
+};
+
+/// Represents an AvailableValue which can be rematerialized at the end of
+/// the associated BasicBlock.
+struct llvm::gvn::AvailableValueInBlock {
+ /// BB - The basic block in question.
+ BasicBlock *BB;
+
+ /// AV - The actual available value
+ AvailableValue AV;
+
+ static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.AV = std::move(AV);
+ return Res;
+ }
+
+ static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+ unsigned Offset = 0) {
+ return get(BB, AvailableValue::get(V, Offset));
+ }
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ return get(BB, AvailableValue::getUndef());
+ }
+
+ /// Emit code at the end of this block to adjust the value defined here to
+ /// the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
+ return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
+ Expression e;
+ e.type = I->getType();
+ e.opcode = I->getOpcode();
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (e.varargs[0] > e.varargs[1])
+ std::swap(e.varargs[0], e.varargs[1]);
+ }
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ CmpInst::Predicate Predicate = C->getPredicate();
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (C->getOpcode() << 8) | Predicate;
+ } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
+ for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+ II != IE; ++II)
+ e.varargs.push_back(*II);
+ }
+
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Not a comparison!");
+ Expression e;
+ e.type = CmpInst::makeCmpResultType(LHS->getType());
+ e.varargs.push_back(lookupOrAdd(LHS));
+ e.varargs.push_back(lookupOrAdd(RHS));
+
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (Opcode << 8) | Predicate;
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+ assert(EI && "Not an ExtractValueInst?");
+ Expression e;
+ e.type = EI->getType();
+ e.opcode = 0;
+
+ IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+ if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) {
+    // EI might be an extract from one of our recognised intrinsics. If it
+    // is, we'll synthesize a semantically equivalent expression instead of
+    // an extract value expression.
+ switch (I->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ e.opcode = Instruction::Add;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ e.opcode = Instruction::Sub;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ e.opcode = Instruction::Mul;
+ break;
+ default:
+ break;
+ }
+
+ if (e.opcode != 0) {
+ // Intrinsic recognized. Grab its args to finish building the expression.
+ assert(I->getNumArgOperands() == 2 &&
+ "Expect two args for recognised intrinsics.");
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(0)));
+ e.varargs.push_back(lookupOrAdd(I->getArgOperand(1)));
+ return e;
+ }
+ }
+
+ // Not a recognised intrinsic. Fall back to producing an extract value
+ // expression.
+ e.opcode = EI->getOpcode();
+ for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+
+ for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
+ II != IE; ++II)
+ e.varargs.push_back(*II);
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+GVN::ValueTable::ValueTable() : nextValueNumber(1) {}
+GVN::ValueTable::ValueTable(const ValueTable &) = default;
+GVN::ValueTable::ValueTable(ValueTable &&) = default;
+GVN::ValueTable::~ValueTable() = default;
+
+/// add - Insert a value into the table with a specified value number.
+void GVN::ValueTable::add(Value *V, uint32_t num) {
+ valueNumbering.insert(std::make_pair(V, num));
+}
+
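+/// Value-number a call instruction. Calls that do not access memory are
+/// numbered purely by their structure; calls that only read memory
+/// additionally consult MemoryDependenceAnalysis to reuse the number of an
+/// identical call whose result is known to still be valid; all other calls
+/// get a fresh value number.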
+uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression exp = createExpr(C);
+ uint32_t &e = expressionNumbering[exp];
+ if (!e) e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ } else if (AA->onlyReadsMemory(C)) {
+ Expression exp = createExpr(C);
+ uint32_t &e = expressionNumbering[exp];
+ if (!e) {
+ e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ }
+ if (!MD) {
+ e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
+ CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+ if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(local_cdep);
+ valueNumbering[C] = v;
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(CallSite(C));
+ // FIXME: Move the checking logic to MemDep!
+ CallInst* cdep = nullptr;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const NonLocalDepEntry *I = &deps[i];
+ if (I->getResult().isNonLocal())
+ continue;
+
+ // We don't handle non-definitions. If we already have a call, reject
+ // instruction dependencies.
+ if (!I->getResult().isDef() || cdep != nullptr) {
+ cdep = nullptr;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+ // FIXME: All duplicated with non-local case.
+ if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = nullptr;
+ break;
+ }
+
+ if (!cdep) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(cdep);
+ valueNumbering[C] = v;
+ return v;
+
+ } else {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction* I = cast<Instruction>(V);
+ Expression exp;
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ return lookupOrAddCall(cast<CallInst>(I));
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ case Instruction::ExtractValue:
+ exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
+ break;
+ default:
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t& e = expressionNumbering[exp];
+ if (!e) e = nextValueNumber++;
+ valueNumbering[V] = e;
+ return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t GVN::ValueTable::lookup(Value *V) const {
+ DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before. Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
+ uint32_t& e = expressionNumbering[exp];
+ if (!e) e = nextValueNumber++;
+ return e;
+}
+
+/// Remove all entries from the ValueTable.
+void GVN::ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ nextValueNumber = 1;
+}
+
+/// Remove a value from the value numbering.
+void GVN::ValueTable::erase(Value *V) {
+ valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void GVN::ValueTable::verifyRemoved(const Value *V) const {
+ for (DenseMap<Value*, uint32_t>::const_iterator
+ I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+ assert(I->first != V && "Inst still occurs in value numbering map!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
+ // FIXME: The order of evaluation of these 'getResult' calls is very
+ // significant! Re-ordering these variables will cause GVN when run alone to
+ // be less effective! We should fix memdep and basic-aa to not exhibit this
+ // behavior, but until then don't change the order here.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep, LI, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
+ errs() << "{\n";
+ for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+ E = d.end(); I != E; ++I) {
+ errs() << I->first << "\n";
+ I->second->dump();
+ }
+ errs() << "}\n";
+}
+#endif
+
+/// Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map actually has four states:
+/// 0) we know the block *is not* fully available.
+/// 1) we know the block *is* fully available.
+/// 2) we do not know whether the block is fully available or not, but we are
+/// currently speculating that it will be.
+/// 3) we are speculating for this block and have used that to speculate for
+/// other blocks.
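+/// For example, when the walk over predecessors re-enters a block that is
+/// currently in state 2 (as happens in a loop), the speculative answer is
+/// used and the block is promoted to state 3; if speculation later fails,
+/// its transitive successors must be walked and reset to state 0.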
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+ DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+ uint32_t RecurseDepth) {
+ if (RecurseDepth > MaxRecurseDepth)
+ return false;
+
+ // Optimistically assume that the block is fully available and check to see
+ // if we already know about this block in one lookup.
+ std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+ FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+ // If the entry already existed for this block, return the precomputed value.
+ if (!IV.second) {
+ // If this is a speculative "available" value, mark it as being used for
+ // speculation of other blocks.
+ if (IV.first->second == 2)
+ IV.first->second = 3;
+ return IV.first->second != 0;
+ }
+
+ // Otherwise, see if it is fully available in all predecessors.
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+ // If this block has no predecessors, it isn't live-in here.
+ if (PI == PE)
+ goto SpeculationFailure;
+
+ for (; PI != PE; ++PI)
+ // If the value isn't fully available in one of our predecessors, then it
+ // isn't fully available in this block either. Undo our previous
+ // optimistic assumption and bail out.
+ if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
+ goto SpeculationFailure;
+
+ return true;
+
+// If we get here, we found out that this is not, after
+// all, a fully-available block. We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+ char &BBVal = FullyAvailableBlocks[BB];
+
+ // If we didn't speculate on this, just return with it set to false.
+ if (BBVal == 2) {
+ BBVal = 0;
+ return false;
+ }
+
+ // If we did speculate on this value, we could have blocks set to 1 that are
+ // incorrect. Walk the (transitive) successors of this block and mark them as
+ // 0 if set to one.
+ SmallVector<BasicBlock*, 32> BBWorklist;
+ BBWorklist.push_back(BB);
+
+ do {
+ BasicBlock *Entry = BBWorklist.pop_back_val();
+ // Note that this sets blocks to 0 (unavailable) if they happen to not
+ // already be in FullyAvailableBlocks. This is safe.
+ char &EntryVal = FullyAvailableBlocks[Entry];
+ if (EntryVal == 0) continue; // Already unavailable.
+
+ // Mark as unavailable.
+ EntryVal = 0;
+
+ BBWorklist.append(succ_begin(Entry), succ_end(Entry));
+ } while (!BBWorklist.empty());
+
+ return false;
+}
+
+/// Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI. This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+ SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+ GVN &gvn) {
+ // Check for the fully redundant, dominating load case. In this case, we can
+ // just use the dominating value directly.
+ if (ValuesPerBlock.size() == 1 &&
+ gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].AV.isUndefValue() &&
+           "Dead BB dominates this block");
+ return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
+ }
+
+ // Otherwise, we have to construct SSA form.
+ SmallVector<PHINode*, 8> NewPHIs;
+ SSAUpdater SSAUpdate(&NewPHIs);
+ SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+ for (const AvailableValueInBlock &AV : ValuesPerBlock) {
+ BasicBlock *BB = AV.BB;
+
+ if (SSAUpdate.HasValueForBlock(BB))
+ continue;
+
+ SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
+ }
+
+ // Perform PHI construction.
+ return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+}
+
+Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
+ Instruction *InsertPt,
+ GVN &gvn) const {
+ Value *Res;
+ Type *LoadTy = LI->getType();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
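+  // The available value comes in one of four forms: a plain stored value, an
+  // earlier load, a memory intrinsic (memset/memcpy), or undef; the first
+  // three may need to be coerced to the type of the load being eliminated.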
+ if (isSimpleValue()) {
+ Res = getSimpleValue();
+ if (Res->getType() != LoadTy) {
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
+ << *getSimpleValue() << '\n'
+ << *Res << '\n' << "\n\n\n");
+ }
+ } else if (isCoercedLoadValue()) {
+ LoadInst *Load = getCoercedLoadValue();
+ if (Load->getType() == LoadTy && Offset == 0) {
+ Res = Load;
+ } else {
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+      // tracks. It is potentially possible to remove the load from the table,
+      // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
+ << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isMemIntrinValue()) {
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ InsertPt, DL);
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n' << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
+ }
+ assert(Res && "failed to materialize?");
+ return Res;
+}
+
+static bool isLifetimeStart(const Instruction *Inst) {
+ if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+ return II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return false;
+}
+
+/// \brief Try to locate the three instructions involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
+ DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+ User *OtherAccess = nullptr;
+
+ OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
+ R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
+ << setExtraArgs();
+
+ for (auto *U : LI->getPointerOperand()->users())
+ if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ DT->dominates(cast<Instruction>(U), LI)) {
+ // FIXME: for now give up if there are multiple memory accesses that
+      // dominate the load. We need further analysis to decide which one we
+      // are forwarding from.
+ if (OtherAccess)
+ OtherAccess = nullptr;
+ else
+ OtherAccess = U;
+ }
+
+ if (OtherAccess)
+ R << " in favor of " << NV("OtherAccess", OtherAccess);
+
+ R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
+
+ ORE->emit(R);
+}
+
+bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
+
+ assert((DepInfo.isDef() || DepInfo.isClobber()) &&
+ "expected a local dependence");
+ assert(LI->isUnordered() && "rules below are incorrect for ordered access");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
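+  // A clobbering access can still provide the loaded bits when it is a store,
+  // a wider or overlapping load, or a mem intrinsic that covers the bytes the
+  // load reads; otherwise we must be conservative.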
+ if (DepInfo.isClobber()) {
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
+ return true;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the later with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+ // If this is a clobber and L is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ Res = AvailableValue::getLoad(DepLI, Offset);
+ return true;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+ if (Address && !LI->isAtomic()) {
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::getMI(DepMI, Offset);
+ return true;
+ }
+ }
+ }
+    // Nothing is known about this clobber; we have to be conservative.
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ Instruction *I = DepInfo.getInst();
+ dbgs() << " is clobbered by " << *I << '\n';
+ );
+
+ if (ORE->allowExtraAnalysis())
+ reportMayClobberedLoad(LI, DepInfo, DT, ORE);
+
+ return false;
+ }
+ assert(DepInfo.isDef() && "follows from above");
+
+ Instruction *DepInst = DepInfo.getInst();
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ Res = AvailableValue::get(UndefValue::get(LI->getType()));
+ return true;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
+ return true;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
+ // different types if we have to. If the stored value is larger or equal to
+ // the loaded value, we can reuse it.
+ if (S->getValueOperand()->getType() != LI->getType() &&
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (S->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::get(S->getValueOperand());
+ return true;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ // If the stored value is larger or equal to the loaded value, we can reuse
+ // it.
+ if (LD->getType() != LI->getType() &&
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LD->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::getLoad(LD);
+ return true;
+ }
+
+ // Unknown def - must be conservative
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';
+ );
+ return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+
+ // Filter out useless results (non-locals, etc). Keep track of the blocks
+  // where we have a value available (ValuesPerBlock) and note whether we see
+ // dependencies that produce an unknown value for the load (such as a call
+ // that could potentially clobber the load).
+ unsigned NumDeps = Deps.size();
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].getBB();
+ MemDepResult DepInfo = Deps[i].getResult();
+
+ if (DeadBlocks.count(DepBB)) {
+      // Treat a dead dependent mem-op as a load evaluating the same value
+ // as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
+      // Subtlety: because we know this was a non-local dependency, we know
+      // it's safe to materialize anywhere between the instruction within
+      // DepInfo and the end of its block.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ std::move(AV)));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
+ }
+ }
+
+ assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
+ "post condition violation");
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Okay, we have *some* definitions of the value. This means that the value
+  // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
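+  // For a simple diamond where the value is available along one incoming path
+  // but not the other, this means inserting a single load in the unavailable
+  // predecessor and merging the results with a PHI.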
+
+ SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+ UnavailableBlocks.end());
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+
+ while (TmpBB->getSinglePredecessor()) {
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+
+ // If any of these blocks has more than one successor (i.e. if the edge we
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
+ // above this block would be adding the load to execution paths along
+ // which it was not previously executed.
+ if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+ return false;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // Check to see how many predecessors have the loaded value fully
+ // available.
+ MapVector<BasicBlock *, Value *> PredLoads;
+ DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+ for (const AvailableValueInBlock &AV : ValuesPerBlock)
+ FullyAvailableBlocks[AV.BB] = true;
+ for (BasicBlock *UnavailableBB : UnavailableBlocks)
+ FullyAvailableBlocks[UnavailableBB] = false;
+
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
+ for (BasicBlock *Pred : predecessors(LoadBB)) {
+ // If any predecessor block is an EH pad that does not allow non-PHI
+ // instructions before the terminator, we can't PRE the load.
+ if (Pred->getTerminator()->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+ continue;
+ }
+
+ if (Pred->getTerminator()->getNumSuccessors() != 1) {
+ if (isa<IndirectBrInst>(Pred->getTerminator())) {
+ DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (LoadBB->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ CriticalEdgePred.push_back(Pred);
+ } else {
+ // Only add the predecessors that will not be split for now.
+ PredLoads[Pred] = nullptr;
+ }
+ }
+
+ // Decide whether PRE is profitable for this load.
+ unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+ assert(NumUnavailablePreds != 0 &&
+ "Fully available value should already be eliminated!");
+
+ // If this load is unavailable in multiple predecessors, reject it.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ if (NumUnavailablePreds != 1)
+ return false;
+
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (BasicBlock *OrigPred : CriticalEdgePred) {
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+ PredLoads[NewPred] = nullptr;
+ DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
+ // Check if the load can safely be moved to all the unavailable predecessors.
+ bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ SmallVector<Instruction*, 8> NewInsts;
+ for (auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+
+ // Do PHI translation to get its value in the predecessor if necessary. The
+ // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+
+ // If all preds have a single successor, then we know it is safe to insert
+ // the load on the pred (?!?), so we can insert code to materialize the
+ // pointer if it is not available.
+ PHITransAddr Address(LI->getPointerOperand(), DL, AC);
+ Value *LoadPtr = nullptr;
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+ *DT, NewInsts);
+
+ // If we couldn't find or insert a computation of this phi translated value,
+ // we fail PRE.
+ if (!LoadPtr) {
+ DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
+ CanDoPRE = false;
+ break;
+ }
+
+ PredLoad.second = LoadPtr;
+ }
+
+ if (!CanDoPRE) {
+ while (!NewInsts.empty()) {
+ Instruction *I = NewInsts.pop_back_val();
+ if (MD) MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+    // HINT: Don't revert the edge-splitting, as a following transformation may
+    // also need to split these critical edges.
+ return !CriticalEdgePred.empty();
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
+ DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ DEBUG(if (!NewInsts.empty())
+ dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
+ << *NewInsts.back() << '\n');
+
+ // Assign value numbers to the new instructions.
+ for (Instruction *I : NewInsts) {
+ // Instructions that have been inserted in predecessor(s) to materialize
+ // the load address do not retain their original debug locations. Doing
+ // so could lead to confusing (but correct) source attributions.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ I->setDebugLoc(DebugLoc());
+
+ // FIXME: We really _ought_ to insert these value numbers into their
+ // parent's availability map. However, in doing so, we risk getting into
+ // ordering issues. If a block hasn't been processed yet, we would be
+ // marking a value as AVAIL-IN, which isn't what we intend.
+ VN.lookupOrAdd(I);
+ }
+
+ for (const auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+ Value *LoadPtr = PredLoad.second;
+
+ auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre",
+ LI->isVolatile(), LI->getAlignment(),
+ LI->getOrdering(), LI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
+
+ if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+ if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
+ NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
+
+ // We do not propagate the old load's debug location, because the new
+ // load now lives in a different BB, and we want to avoid a jumpy line
+ // table.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+
+ // Add the newly created load.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+ NewLoad));
+ MD->invalidateCachedPointerInfo(LoadPtr);
+ DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ }
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE");
+ ++NumPRELoad;
+ return true;
+}
+
+static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue));
+}
+
+/// Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+  // Non-local speculations are not allowed under ASan.
+ if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress))
+ return false;
+
+ // Step 1: Find the non-local dependencies of the load.
+ LoadDepVect Deps;
+ MD->getNonLocalPointerDependency(LI, Deps);
+
+ // If we had to process more than one hundred blocks to find the
+ // dependencies, this load isn't worth worrying about. Optimizing
+ // it will be too expensive.
+ unsigned NumDeps = Deps.size();
+ if (NumDeps > 100)
+ return false;
+
+ // If we had a phi translation failure, we'll have a single entry which is a
+ // clobber in the current block. Reject this early.
+ if (NumDeps == 1 &&
+ !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+ DEBUG(
+ dbgs() << "GVN: non-local load ";
+ LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";
+ );
+ return false;
+ }
+
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ performScalarPRE(I);
+ }
+
+ // Step 2: Analyze the availability of the load
+ AvailValInBlkVect ValuesPerBlock;
+ UnavailBlkVect UnavailableBlocks;
+ AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+ // If we have no predecessors that produce a known value for this load, exit
+ // early.
+ if (ValuesPerBlock.empty())
+ return false;
+
+  // Step 3: Eliminate full redundancy.
+ //
+ // If all of the instructions we depend on produce a known value for this
+ // load, then it is fully redundant and we can use PHI insertion to compute
+ // its value. Insert PHIs and remove the fully redundant value now.
+ if (UnavailableBlocks.empty()) {
+ DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ // If instruction I has debug info, then we should not update it.
+ // Also, if I has a null DebugLoc, then it is still potentially incorrect
+ // to propagate LI's DebugLoc because LI may not post-dominate I.
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ++NumGVNLoad;
+ reportLoadElim(LI, V, ORE);
+ return true;
+ }
+
+ // Step 4: Eliminate partial redundancy.
+ if (!EnablePRE || !EnableLoadPRE)
+ return false;
+
+ return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
+}
+
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+ assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+ "This function can only be called with llvm.assume intrinsic");
+ Value *V = IntrinsicI->getArgOperand(0);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+ if (Cond->isZero()) {
+ Type *Int8Ty = Type::getInt8Ty(V->getContext());
+      // Insert a new store-to-null instruction before the assume to indicate
+      // that this code is not reachable. FIXME: We could insert an unreachable
+      // instruction directly because we can modify the CFG.
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ IntrinsicI);
+ }
+ markInstructionForDeletion(IntrinsicI);
+ return false;
+ }
+
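+  // The assumed condition holds from this point on: propagate it as 'true'
+  // along each outgoing edge and remember it for replacement in this block.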
+ Constant *True = ConstantInt::getTrue(V->getContext());
+ bool Changed = false;
+
+ for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
+ BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
+
+ // This property is only true in dominated successors, propagateEquality
+ // will check dominance for us.
+ Changed |= propagateEquality(V, True, Edge, false);
+ }
+
+  // We can replace the assume's condition with true, which covers cases like:
+ // call void @llvm.assume(i1 %cmp)
+ // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+ ReplaceWithConstMap[V] = True;
+
+  // If one operand of an equality comparison is a constant, adding it to the
+  // replacement map will cover cases like this:
+ // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %0 ; will change it to ret float 3.000000e+00
+ if (auto *CmpI = dyn_cast<CmpInst>(V)) {
+ if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ ||
+ CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
+ (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
+ CmpI->getFastMathFlags().noNaNs())) {
+ Value *CmpLHS = CmpI->getOperand(0);
+ Value *CmpRHS = CmpI->getOperand(1);
+ if (isa<Constant>(CmpLHS))
+ std::swap(CmpLHS, CmpRHS);
+ auto *RHSConst = dyn_cast<Constant>(CmpRHS);
+
+ // If only one operand is constant.
+ if (RHSConst != nullptr && !isa<Constant>(CmpLHS))
+ ReplaceWithConstMap[CmpLHS] = RHSConst;
+ }
+ }
+ return Changed;
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+/// Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+ if (!MD)
+ return false;
+
+ // This code hasn't been audited for ordered or volatile memory access
+ if (!L->isUnordered())
+ return false;
+
+ if (L->use_empty()) {
+ markInstructionForDeletion(L);
+ return true;
+ }
+
+ // ... to a pointer that has been loaded from before...
+ MemDepResult Dep = MD->getDependency(L);
+
+ // If it is defined in another block, try harder.
+ if (Dep.isNonLocal())
+ return processNonLocalLoad(L);
+
+ // Only handle the local case below
+ if (!Dep.isDef() && !Dep.isClobber()) {
+ // This might be a NonFuncLocal or an Unknown
+ DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load ";
+ L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";
+ );
+ return false;
+ }
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
+ Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
+
+ // Replace the load!
+ patchAndReplaceAllUsesWith(L, AvailableValue);
+ markInstructionForDeletion(L);
+ ++NumGVNLoad;
+ reportLoadElim(L, AvailableValue, ORE);
+    // Tell MDA to reexamine the reused pointer since we might have more
+ // information after forwarding it.
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(AvailableValue);
+ return true;
+ }
+
+ return false;
+}
+
+// In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question. This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+ LeaderTableEntry Vals = LeaderTable[num];
+ if (!Vals.Val) return nullptr;
+
+ Value *Val = nullptr;
+ if (DT->dominates(Vals.BB, BB)) {
+ Val = Vals.Val;
+ if (isa<Constant>(Val)) return Val;
+ }
+
+ LeaderTableEntry* Next = Vals.Next;
+ while (Next) {
+ if (DT->dominates(Next->BB, BB)) {
+ if (isa<Constant>(Next->Val)) return Next->Val;
+ if (!Val) Val = Next->Val;
+ }
+
+ Next = Next->Next;
+ }
+
+ return Val;
+}
+
+/// There is an edge from 'Src' to 'Dst'. Return
+/// true if every path from the entry block to 'Dst' passes via this edge. In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+ DominatorTree *DT) {
+ // While in theory it is interesting to consider the case in which Dst has
+ // more than one predecessor, because Dst might be part of a loop which is
+ // only reachable from Src, in practice it is pointless since at the time
+ // GVN runs all such loops have preheaders, which means that Dst will have
+ // been changed to have only one predecessor, namely Src.
+ const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+ assert((!Pred || Pred == E.getStart()) &&
+ "No edge between these basic blocks!");
+ return Pred != nullptr;
+}
+
+// Tries to replace the instruction's operands with constants, using
+// information from ReplaceWithConstMap.
+bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
+ bool Changed = false;
+ for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
+ Value *Operand = Instr->getOperand(OpNum);
+ auto it = ReplaceWithConstMap.find(Operand);
+ if (it != ReplaceWithConstMap.end()) {
+ assert(!isa<Constant>(Operand) &&
+ "Replacing constants with constants is invalid");
+ DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
+ << " in instruction " << *Instr << '\n');
+ Instr->setOperand(OpNum, it->second);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+/// The given values are known to be equal in every block
+/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope. Returns whether a change was made.
+/// If DominatesByEdge is false, then it means that we will propagate the RHS
+/// value starting from the end of Root.Start.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
+ SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+ Worklist.push_back(std::make_pair(LHS, RHS));
+ bool Changed = false;
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
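+  // Deducing one equality may imply others (e.g. "A && B" == true implies both
+  // A == true and B == true), so keep a worklist and process it to exhaustion.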
+ while (!Worklist.empty()) {
+ std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+ LHS = Item.first; RHS = Item.second;
+
+ if (LHS == RHS)
+ continue;
+ assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ continue;
+
+ // Prefer a constant on the right-hand side, or an Argument if no constants.
+ if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+ std::swap(LHS, RHS);
+ assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
+ uint32_t LVN = VN.lookupOrAdd(LHS);
+ if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+ (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
+ uint32_t RVN = VN.lookupOrAdd(RHS);
+ if (LVN < RVN) {
+ std::swap(LHS, RHS);
+ LVN = RVN;
+ }
+ }
+
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+    // The leader table only tracks basic blocks, not edges. Only add to it if
+    // we have the simple case where the edge dominates the end.
+ if (RootDominatesEnd && !isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root.getEnd());
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
+ // LHS always has at least one use that is not dominated by Root, this will
+ // never do anything if LHS has only one use.
+ if (!LHS->hasOneUse()) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
+
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ continue;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ continue;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isMinusOne();
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
+ if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+ (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+ Worklist.push_back(std::make_pair(A, RHS));
+ Worklist.push_back(std::make_pair(B, RHS));
+ continue;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B. When propagating a comparison such as
+ // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+ // If "A == B" is known true, or "A != B" is known false, then replace
+ // A with B everywhere in the scope.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
+ // Handle the floating point versions of equality comparisons too.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+ // Floating point -0.0 and 0.0 compare equal, so we can only
+ // propagate values if we know that we have a constant and that
+ // its value is non-zero.
+
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+
+ if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+ Worklist.push_back(std::make_pair(Op0, Op1));
+ }
+
+ // If "A >= B" is known true, replace "A < B" with false everywhere.
+ CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+ Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ // If the number we were assigned was brand new then there is no point in
+ // looking for an instruction realizing it: there cannot be one!
+ if (Num < NextNum) {
+ Value *NotCmp = findLeader(Root.getEnd(), Num);
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+ : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+ Root.getStart());
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+ }
+ // Ensure that any instruction in scope that gets the "A < B" value number
+ // is replaced with false.
+      // The leader table only tracks basic blocks, not edges. Only add to it
+      // if we have the simple case where the edge dominates the end.
+ if (RootDominatesEnd)
+ addToLeaderTable(Num, NotVal, Root.getEnd());
+
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
+/// When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+ // Ignore dbg info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // If the instruction can be easily simplified then do so now in preference
+ // to value numbering it. Value numbering often exposes redundancies, for
+ // example if it determines that %y is equal to %x then the instruction
+ // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ markInstructionForDeletion(I);
+ Changed = true;
+ }
+ if (Changed) {
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ ++NumGVNSimpl;
+ return true;
+ }
+ }
+
+ if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+ return processAssumeIntrinsic(IntrinsicI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (processLoad(LI))
+ return true;
+
+ unsigned Num = VN.lookupOrAdd(LI);
+ addToLeaderTable(Num, LI, LI->getParent());
+ return false;
+ }
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (!BI->isConditional())
+ return false;
+
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ // Avoid multiple edges early.
+ if (TrueSucc == FalseSucc)
+ return false;
+
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+ BasicBlockEdge TrueE(Parent, TrueSucc);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+
+ Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+ BasicBlockEdge FalseE(Parent, FalseSucc);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+
+ return Changed;
+ }
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *Dst = i->getCaseSuccessor();
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
+ }
+ }
+ return Changed;
+ }
+
+ // Instructions with void type don't return a value, so there's
+ // no point in trying to find redundancies in them.
+ if (I->getType()->isVoidTy())
+ return false;
+
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ unsigned Num = VN.lookupOrAdd(I);
+
+ // Allocations are always uniquely numbered, so we can save time and memory
+ // by fast failing them.
+ if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ if (Num >= NextNum) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ Value *Repl = findLeader(I->getParent(), Num);
+ if (!Repl) {
+ // Failure, just remember this instance for future use.
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ } else if (Repl == I) {
+ // If I was the result of a shortcut PRE, it might already be in the table
+ // and the best replacement for itself. Nothing to do.
+ return false;
+ }
+
+ // Remove it!
+ patchAndReplaceAllUsesWith(I, Repl);
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Repl);
+ markInstructionForDeletion(I);
+ return true;
+}
+
+/// runImpl - This is the main transformation entry point for a function.
+bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
+ OptimizationRemarkEmitter *RunORE) {
+ AC = &RunAC;
+ DT = &RunDT;
+ VN.setDomTree(DT);
+ TLI = &RunTLI;
+ VN.setAliasAnalysis(&RunAA);
+ MD = RunMD;
+ VN.setMemDep(MD);
+ ORE = RunORE;
+
+ bool Changed = false;
+ bool ShouldContinue = true;
+
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+ BasicBlock *BB = &*FI++;
+
+ bool removedBlock = MergeBlockIntoPredecessor(BB, DT, LI, MD);
+ if (removedBlock)
+ ++NumGVNBlocks;
+
+ Changed |= removedBlock;
+ }
+
+ unsigned Iteration = 0;
+ while (ShouldContinue) {
+ DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ ShouldContinue = iterateOnFunction(F);
+ Changed |= ShouldContinue;
+ ++Iteration;
+ }
+
+ if (EnablePRE) {
+ // Fabricate val-num for dead-code in order to suppress assertion in
+ // performPRE().
+ assignValNumForDeadCode();
+ bool PREChanged = true;
+ while (PREChanged) {
+ PREChanged = performPRE(F);
+ Changed |= PREChanged;
+ }
+ }
+
+ // FIXME: Should perform GVN again after PRE does something. PRE can move
+ // computations into blocks where they become fully redundant. Note that
+ // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+ cleanupGlobalSets();
+ // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+ // iteration.
+ DeadBlocks.clear();
+
+ return Changed;
+}
+
+bool GVN::processBlock(BasicBlock *BB) {
+  // FIXME: Kill off InstrsToErase by erasing eagerly in a helper function
+ // (and incrementing BI before processing an instruction).
+ assert(InstrsToErase.empty() &&
+ "We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
+  // Clear the map before every BB because it is only valid for a single BB.
+ ReplaceWithConstMap.clear();
+ bool ChangedFunction = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ if (!ReplaceWithConstMap.empty())
+ ChangedFunction |= replaceOperandsWithConsts(&*BI);
+ ChangedFunction |= processInstruction(&*BI);
+
+ if (InstrsToErase.empty()) {
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += InstrsToErase.size();
+
+ // Avoid iterator invalidation.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
+ E = InstrsToErase.end(); I != E; ++I) {
+ DEBUG(dbgs() << "GVN removed: " << **I << '\n');
+ if (MD) MD->removeInstruction(*I);
+ DEBUG(verifyRemoved(*I));
+ (*I)->eraseFromParent();
+ }
+ InstrsToErase.clear();
+
+ if (AtStart)
+ BI = BB->begin();
+ else
+ ++BI;
+ }
+
+ return ChangedFunction;
+}
+
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ unsigned int ValNo) {
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ bool success = true;
+ for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+ Value *Op = Instr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+ // This could be a newly inserted instruction, in which case, we won't
+ // find a value number, and should give up before we hurt ourselves.
+    // FIXME: Rewrite the infrastructure to make it easier to value number
+ // and process newly inserted instructions.
+ if (!VN.exists(Op)) {
+ success = false;
+ break;
+ }
+ if (Value *V = findLeader(Pred, VN.lookup(Op))) {
+ Instr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success)
+ return false;
+
+ Instr->insertBefore(Pred->getTerminator());
+ Instr->setName(Instr->getName() + ".pre");
+ Instr->setDebugLoc(Instr->getDebugLoc());
+ VN.add(Instr, ValNo);
+
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(ValNo, Instr, Pred);
+ return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
+ if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
+
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
+
+ // We don't currently value number ANY inline asm calls.
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+ if (CallI->isInlineAsm())
+ return false;
+
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
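+  // Concretely: if the value is available in every predecessor except one, we
+  // clone the computation into that predecessor and merge the copies with a
+  // PHI in this block.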
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+
+ SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
+ for (BasicBlock *P : predecessors(CurrentBlock)) {
+ // We're not interested in PRE where the block is its
+ // own predecessor, or in blocks with predecessors
+ // that are not reachable.
+ if (P == CurrentBlock) {
+ NumWithout = 2;
+ break;
+ } else if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
+
+ Value *predV = findLeader(P, ValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
+
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
+
+ if (NumWithout != 0) {
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ DEBUG(verifyRemoved(PREInstr));
+ PREInstr->deleteValue();
+ return false;
+ }
+ }
+
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert (PREInstr != nullptr || NumWithout == 0);
+
+ ++NumGVNPRE;
+
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first)
+ Phi->addIncoming(V, predMap[i].second);
+ else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
+
+ VN.add(Phi, ValNo);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
+ DEBUG(verifyRemoved(CurInst));
+ CurInst->eraseFromParent();
+ ++NumGVNInstr;
+
+ return true;
+}
+
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
+
+ // Don't perform PRE on an EH pad.
+ if (CurrentBlock->isEHPad())
+ continue;
+
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = &*BI++;
+ Changed |= performScalarPRE(CurInst);
+ }
+ }
+
+ if (splitCriticalEdges())
+ Changed = true;
+
+ return Changed;
+}
+
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+ BasicBlock *BB =
+ SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));
+ if (MD)
+ MD->invalidateCachedPredecessors();
+ return BB;
+}
+
+/// Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+ if (toSplit.empty())
+ return false;
+ do {
+ std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+ SplitCriticalEdge(Edge.first, Edge.second,
+ CriticalEdgeSplittingOptions(DT));
+ } while (!toSplit.empty());
+ if (MD) MD->invalidateCachedPredecessors();
+ return true;
+}
+
+/// Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+ cleanupGlobalSets();
+
+ // Top-down walk of the dominator tree
+ bool Changed = false;
+ // Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
+
+ return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+ VN.clear();
+ LeaderTable.clear();
+ TableAllocator.Reset();
+}
+
+/// Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+ I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+ const LeaderTableEntry *Node = &I->second;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+ while (Node->Next) {
+ Node = Node->Next;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+ }
+ }
+}
+
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all such blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
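+  // Worklist algorithm: a block is dead if it is dominated by a dead block or
+  // if all of its predecessors are dead. Grow the dead set until it is closed
+  // under both rules, collecting the surviving frontier blocks in DF.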
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom) {
+ for (BasicBlock *S : successors(B)) {
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (BasicBlock *P : predecessors(S))
+ if (!DeadBlocks.count(P)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+          // While S is not dominated by D, it is dead by now. This could
+          // happen if S already had a dead predecessor before D was declared
+          // dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+ for (BasicBlock *P : Preds) {
+ if (!DeadBlocks.count(P))
+ continue;
+
+ if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+
+ for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+ PHINode &Phi = cast<PHINode>(*II);
+ Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+ UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), it will perform the following analyses
+// and transformations.
+//  1) If the dead outgoing edge is a critical edge, split it. Let R be the
+//     target of the dead outgoing edge.
+//  2) Identify the set of dead blocks implied by the branch's dead outgoing
+//     edge. The result of this step will be {X | X is dominated by R}.
+//  3) Identify those blocks which have at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//     dead blocks with "UndefVal" in the hope that these PHIs will be
+//     optimized away.
+//
+// Return true iff *NEW* dead code is found.
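+//
+// For example (a made-up fragment, not from a test case):
+//   br i1 true, label %live, label %dead
+// Here %dead is the root R of the dead region (after splitting the edge to it
+// if that edge is critical), and every block dominated by R becomes dead.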
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot =
+ Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated value number. As a function normally has far more
+// live instructions than dead ones, it makes more sense to just "fabricate"
+// a value number for the dead code than to check whether the instruction
+// involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (BasicBlock *BB : DeadBlocks) {
+ for (Instruction &Inst : *BB) {
+ unsigned ValNum = VN.lookupOrAdd(&Inst);
+ addToLeaderTable(ValNum, &Inst, BB);
+ }
+ }
+}
+
+class llvm::gvn::GVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit GVNLegacyPass(bool NoLoads = false)
+ : FunctionPass(ID), NoLoads(NoLoads) {
+ initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+
+ return Impl.runImpl(
+ F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ NoLoads ? nullptr
+ : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
+ LIWP ? &LIWP->getLoopInfo() : nullptr,
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (!NoLoads)
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+private:
+ bool NoLoads;
+ GVN Impl;
+};
+
+char GVNLegacyPass::ID = 0;
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+ return new GVNLegacyPass(NoLoads);
+}
+
+INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
new file mode 100644
index 000000000000..29de792bd248
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -0,0 +1,1045 @@
+//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists expressions from branches to a common dominator. It uses
+// GVN (global value numbering) to discover expressions computing the same
+// values. The primary goals of code-hoisting are:
+// 1. To reduce the code size.
+// 2. In some cases reduce critical path (by exposing more ILP).
+//
+// Hoisting may negatively affect performance in some cases. To mitigate that,
+// hoisting is disabled in the following cases:
+// 1. Scalars across calls.
+// 2. GEPs when the corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+// switch i32 %c1, label %exit1 [
+// i32 0, label %sw0
+// i32 1, label %sw1
+// ]
+//
+// sw0:
+// store i32 1, i32* @G
+// br label %exit
+//
+// sw1:
+// store i32 1, i32* @G
+// br label %exit
+//
+// exit1:
+// store i32 1, i32* @G
+// ret void
+// exit:
+// ret void
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-hoist"
+
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
+STATISTIC(NumLoadsRemoved, "Number of loads removed");
+STATISTIC(NumStoresHoisted, "Number of stores hoisted");
+STATISTIC(NumStoresRemoved, "Number of stores removed");
+STATISTIC(NumCallsHoisted, "Number of calls hoisted");
+STATISTIC(NumCallsRemoved, "Number of calls removed");
+
+static cl::opt<int>
+ MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
+ cl::desc("Max number of instructions to hoist "
+ "(default unlimited = -1)"));
+static cl::opt<int> MaxNumberOfBBSInPath(
+ "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
+ cl::desc("Max number of basic blocks on the path between "
+ "hoisting locations (default = 4, unlimited = -1)"));
+
+static cl::opt<int> MaxDepthInBB(
+ "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
+ cl::desc("Hoist instructions from the beginning of the BB up to the "
+ "maximum specified depth (default = 100, unlimited = -1)"));
+
+static cl::opt<int>
+ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
+ cl::desc("Maximum length of dependent chains to hoist "
+ "(default = 10, unlimited = -1)"));
+
+namespace llvm {
+
+// Provides a sorting function based on the execution order of two instructions.
+struct SortByDFSIn {
+private:
+ DenseMap<const Value *, unsigned> &DFSNumber;
+
+public:
+ SortByDFSIn(DenseMap<const Value *, unsigned> &D) : DFSNumber(D) {}
+
+ // Returns true when A executes before B.
+ bool operator()(const Instruction *A, const Instruction *B) const {
+ const BasicBlock *BA = A->getParent();
+ const BasicBlock *BB = B->getParent();
+ unsigned ADFS, BDFS;
+ if (BA == BB) {
+ ADFS = DFSNumber.lookup(A);
+ BDFS = DFSNumber.lookup(B);
+ } else {
+ ADFS = DFSNumber.lookup(BA);
+ BDFS = DFSNumber.lookup(BB);
+ }
+ assert(ADFS && BDFS);
+ return ADFS < BDFS;
+ }
+};
+
+// A map from a pair of VNs to all the instructions with those VNs.
+typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>>
+ VNtoInsns;
+// An invalid value number used when inserting a single value number into
+// VNtoInsns.
+enum : unsigned { InvalidVN = ~2U };
+
+// Records all scalar instructions that are candidates for code hoisting.
+class InsnInfo {
+ VNtoInsns VNtoScalars;
+
+public:
+ // Inserts I and its value number in VNtoScalars.
+ void insert(Instruction *I, GVN::ValueTable &VN) {
+ // Scalar instruction.
+ unsigned V = VN.lookupOrAdd(I);
+ VNtoScalars[{V, InvalidVN}].push_back(I);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoScalars; }
+};
+
+// Records all load instructions that are candidates for code hoisting.
+class LoadInfo {
+ VNtoInsns VNtoLoads;
+
+public:
+ // Insert Load and the value number of its memory address in VNtoLoads.
+ void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ if (Load->isSimple()) {
+ unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
+ VNtoLoads[{V, InvalidVN}].push_back(Load);
+ }
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoLoads; }
+};
+
+// Records all store instructions that are candidates for code hoisting.
+class StoreInfo {
+ VNtoInsns VNtoStores;
+
+public:
+ // Insert the Store and a hash number of the store address and the stored
+ // value in VNtoStores.
+ void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ if (!Store->isSimple())
+ return;
+ // Hash the store address and the stored value.
+ Value *Ptr = Store->getPointerOperand();
+ Value *Val = Store->getValueOperand();
+ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoStores; }
+};
+
+// Records all call instructions that are candidates for code hoisting.
+class CallInfo {
+ VNtoInsns VNtoCallsScalars;
+ VNtoInsns VNtoCallsLoads;
+ VNtoInsns VNtoCallsStores;
+
+public:
+  // Insert Call and its value number in one of the VNtoCalls* containers.
+ void insert(CallInst *Call, GVN::ValueTable &VN) {
+    // A call that doesNotAccessMemory is handled as a scalar, a call that
+    // onlyReadsMemory is handled as a load instruction, and all other calls
+    // are handled as stores.
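+    // For example (illustrative only; the classification depends on the
+    // call's attributes in the IR): a call marked readnone goes into
+    // VNtoCallsScalars, a call marked readonly goes into VNtoCallsLoads, and
+    // a call that may write memory goes into VNtoCallsStores.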
+ unsigned V = VN.lookupOrAdd(Call);
+ auto Entry = std::make_pair(V, InvalidVN);
+
+ if (Call->doesNotAccessMemory())
+ VNtoCallsScalars[Entry].push_back(Call);
+ else if (Call->onlyReadsMemory())
+ VNtoCallsLoads[Entry].push_back(Call);
+ else
+ VNtoCallsStores[Entry].push_back(Call);
+ }
+
+ const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
+
+ const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
+
+ const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
+};
+
+typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet;
+typedef SmallVector<Instruction *, 4> SmallVecInsn;
+typedef SmallVectorImpl<Instruction *> SmallVecImplInsn;
+
+static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+// This pass hoists common computations across branches sharing a common
+// dominator. The primary goal is to reduce the code size, and in some
+// cases reduce the critical path (by exposing more ILP).
+class GVNHoist {
+public:
+ GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
+ MemorySSA *MSSA)
+ : DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
+ HoistingGeps(false),
+ HoistedCtr(0)
+ { }
+
+ bool run(Function &F) {
+ VN.setDomTree(DT);
+ VN.setAliasAnalysis(AA);
+ VN.setMemDep(MD);
+ bool Res = false;
+ // Perform DFS Numbering of instructions.
+ unsigned BBI = 0;
+ for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ DFSNumber[BB] = ++BBI;
+ unsigned I = 0;
+ for (auto &Inst : *BB)
+ DFSNumber[&Inst] = ++I;
+ }
+
+ int ChainLength = 0;
+
+ // FIXME: use lazy evaluation of VN to avoid the fix-point computation.
+ while (1) {
+ if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength)
+ return Res;
+
+ auto HoistStat = hoistExpressions(F);
+ if (HoistStat.first + HoistStat.second == 0)
+ return Res;
+
+ if (HoistStat.second > 0)
+ // To address a limitation of the current GVN, we need to rerun the
+ // hoisting after we hoisted loads or stores in order to be able to
+ // hoist all scalars dependent on the hoisted ld/st.
+ VN.clear();
+
+ Res = true;
+ }
+
+ return Res;
+ }
+
+private:
+ GVN::ValueTable VN;
+ DominatorTree *DT;
+ AliasAnalysis *AA;
+ MemoryDependenceResults *MD;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+ const bool HoistingGeps;
+ DenseMap<const Value *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock*> HoistBarrier;
+ int HoistedCtr;
+
+ enum InsKind { Unknown, Scalar, Load, Store };
+
+  // Return true when there is exception handling in BB.
+ bool hasEH(const BasicBlock *BB) {
+ auto It = BBSideEffects.find(BB);
+ if (It != BBSideEffects.end())
+ return It->second;
+
+ if (BB->isEHPad() || BB->hasAddressTaken()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ if (BB->getTerminator()->mayThrow()) {
+ BBSideEffects[BB] = true;
+ return true;
+ }
+
+ BBSideEffects[BB] = false;
+ return false;
+ }
+
+ // Return true when a successor of BB dominates A.
+ bool successorDominate(const BasicBlock *BB, const BasicBlock *A) {
+ for (const BasicBlock *Succ : BB->getTerminator()->successors())
+ if (DT->dominates(Succ, A))
+ return true;
+
+ return false;
+ }
+
+ // Return true when all paths from HoistBB to the end of the function pass
+ // through one of the blocks in WL.
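+  //
+  // For illustration (a made-up diamond CFG), with HoistBB = A and
+  // WL = {B, C}:
+  //
+  //        A
+  //       / \
+  //      B   C
+  //       \ /
+  //        D --> exit
+  //
+  // the answer is true, since every path from A to the function exit passes
+  // through B or C.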
+ bool hoistingFromAllPaths(const BasicBlock *HoistBB,
+ SmallPtrSetImpl<const BasicBlock *> &WL) {
+
+ // Copy WL as the loop will remove elements from it.
+ SmallPtrSet<const BasicBlock *, 2> WorkList(WL.begin(), WL.end());
+
+ for (auto It = df_begin(HoistBB), E = df_end(HoistBB); It != E;) {
+      // There exists a path from HoistBB to the exit of the function if we
+      // are still iterating in the DF traversal and we have removed all
+      // blocks from the work list.
+ if (WorkList.empty())
+ return false;
+
+ const BasicBlock *BB = *It;
+ if (WorkList.erase(BB)) {
+ // Stop DFS traversal when BB is in the work list.
+ It.skipChildren();
+ continue;
+ }
+
+      // We reached a leaf basic block => not all paths have this instruction.
+ if (!BB->getTerminator()->getNumSuccessors())
+ return false;
+
+      // When reaching the back-edge of a loop, there may be a path through
+      // the loop that does not pass through any block in WL before exiting
+      // the loop.
+ if (successorDominate(BB, HoistBB))
+ return false;
+
+ // Increment DFS traversal when not skipping children.
+ ++It;
+ }
+
+ return true;
+ }
+
+  // Return true when I1 appears before I2 in the instructions of BB.
+ bool firstInBB(const Instruction *I1, const Instruction *I2) {
+ assert(I1->getParent() == I2->getParent());
+ unsigned I1DFS = DFSNumber.lookup(I1);
+ unsigned I2DFS = DFSNumber.lookup(I2);
+ assert(I1DFS && I2DFS);
+ return I1DFS < I2DFS;
+ }
+
+ // Return true when there are memory uses of Def in BB.
+ bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+ const BasicBlock *BB) {
+ const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB);
+ if (!Acc)
+ return false;
+
+ Instruction *OldPt = Def->getMemoryInst();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *NewBB = NewPt->getParent();
+ bool ReachedNewPt = false;
+
+ for (const MemoryAccess &MA : *Acc)
+ if (const MemoryUse *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Insn = MU->getMemoryInst();
+
+ // Do not check whether MU aliases Def when MU occurs after OldPt.
+ if (BB == OldBB && firstInBB(OldPt, Insn))
+ break;
+
+ // Do not check whether MU aliases Def when MU occurs before NewPt.
+ if (BB == NewBB) {
+ if (!ReachedNewPt) {
+ if (firstInBB(Insn, NewPt))
+ continue;
+ ReachedNewPt = true;
+ }
+ }
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
+ return true;
+ }
+
+ return false;
+ }
+
+  // Return true when there is exception handling or a load of memory Def
+  // between Def and NewPt. This function is only called for stores: Def is
+  // the MemoryDef of the store to be hoisted.
+  //
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1, which means unlimited.
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+ int &NBBsOnAllPaths) {
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = Def->getBlock();
+ assert(DT->dominates(NewBB, OldBB) && "invalid path");
+ assert(DT->dominates(Def->getDefiningAccess()->getBlock(), NewBB) &&
+ "def does not dominate new hoisting point");
+
+ // Walk all basic blocks reachable in depth-first iteration on the inverse
+ // CFG from OldBB to NewBB. These blocks are all the blocks that may be
+ // executed between the execution of NewBB and OldBB. Hoisting an expression
+ // from OldBB into NewBB has to be safe on all execution paths.
+ for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
+ // Stop traversal when reaching HoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(BB))
+ return true;
+
+      // No instruction after a hoist barrier in a basic block is ever
+      // selected for hoisting, so instructions selected within a basic block
+      // that contains a hoist barrier can still be hoisted.
+ if ((BB != OldBB) && HoistBarrier.count(BB))
+ return true;
+
+ // Check that we do not move a store past loads.
+ if (hasMemoryUse(NewPt, Def, BB))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
+
+  // Return true when there is exception handling between HoistPt and SrcBB.
+ // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
+ // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+ // initialized to -1 which is unlimited.
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
+ int &NBBsOnAllPaths) {
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
+
+ // Walk all basic blocks reachable in depth-first iteration on
+ // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
+ // blocks that may be executed between the execution of NewHoistPt and
+ // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
+ // on all execution paths.
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
+ // Stop traversal when reaching NewHoistPt.
+ I.skipChildren();
+ continue;
+ }
+
+ // Stop walk once the limit is reached.
+ if (NBBsOnAllPaths == 0)
+ return true;
+
+ // Impossible to hoist with exceptions on the path.
+ if (hasEH(BB))
+ return true;
+
+      // No instruction after a hoist barrier in a basic block is ever
+      // selected for hoisting, so instructions selected within a basic block
+      // that contains a hoist barrier can still be hoisted.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
+ return true;
+
+ // -1 is unlimited number of blocks on all paths.
+ if (NBBsOnAllPaths != -1)
+ --NBBsOnAllPaths;
+
+ ++I;
+ }
+
+ return false;
+ }
+
+ // Return true when it is safe to hoist a memory load or store U from OldPt
+ // to NewPt.
+ bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
+ MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
+
+ // In place hoisting is safe.
+ if (NewPt == OldPt)
+ return true;
+
+ const BasicBlock *NewBB = NewPt->getParent();
+ const BasicBlock *OldBB = OldPt->getParent();
+ const BasicBlock *UBB = U->getBlock();
+
+ // Check for dependences on the Memory SSA.
+ MemoryAccess *D = U->getDefiningAccess();
+ BasicBlock *DBB = D->getBlock();
+ if (DT->properlyDominates(NewBB, DBB))
+ // Cannot move the load or store to NewBB above its definition in DBB.
+ return false;
+
+ if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D))
+ if (auto *UD = dyn_cast<MemoryUseOrDef>(D))
+ if (firstInBB(NewPt, UD->getMemoryInst()))
+ // Cannot move the load or store to NewPt above its definition in D.
+ return false;
+
+ // Check for unsafe hoistings due to side effects.
+ if (K == InsKind::Store) {
+ if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths))
+ return false;
+ } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths))
+ return false;
+
+ if (UBB == NewBB) {
+ if (DT->properlyDominates(DBB, NewBB))
+ return true;
+ assert(UBB == DBB);
+ assert(MSSA->locallyDominates(D, U));
+ }
+
+ // No side effects: it is safe to hoist.
+ return true;
+ }
+
+ // Return true when it is safe to hoist scalar instructions from all blocks in
+ // WL to HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB,
+ SmallPtrSetImpl<const BasicBlock *> &WL,
+ int &NBBsOnAllPaths) {
+ // Check that the hoisted expression is needed on all paths.
+ if (!hoistingFromAllPaths(HoistBB, WL))
+ return false;
+
+ for (const BasicBlock *BB : WL)
+ if (hasEHOnPath(HoistBB, BB, NBBsOnAllPaths))
+ return false;
+
+ return true;
+ }
+
+  // Each element of a hoisting list contains the basic block to hoist into
+  // and a list of instructions to be hoisted.
+ typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo;
+ typedef SmallVector<HoistingPointInfo, 4> HoistingPointList;
+
+  // Partition InstructionsToHoist into a set of candidates which can share a
+  // common hoisting point. The partitions are collected in HPL. K indicates
+  // whether the instructions in InstructionsToHoist are scalars, loads, or
+  // stores.
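+  //
+  // For illustration (a made-up run): with candidates {I1, I2, I3} sorted by
+  // DFS number, the loop below tries to extend the current hoisting point
+  // from I1 to cover I2, then I3; whenever extending is not safe, the
+  // instructions grouped so far are emitted as one (HoistBB, instructions)
+  // entry in HPL and a new group is started at the offending instruction.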
+ void partitionCandidates(SmallVecImplInsn &InstructionsToHoist,
+ HoistingPointList &HPL, InsKind K) {
+ // No need to sort for two instructions.
+ if (InstructionsToHoist.size() > 2) {
+ SortByDFSIn Pred(DFSNumber);
+ std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred);
+ }
+
+ int NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+
+ SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
+ SmallVecImplInsn::iterator Start = II;
+ Instruction *HoistPt = *II;
+ BasicBlock *HoistBB = HoistPt->getParent();
+ MemoryUseOrDef *UD;
+ if (K != InsKind::Scalar)
+ UD = MSSA->getMemoryAccess(HoistPt);
+
+ for (++II; II != InstructionsToHoist.end(); ++II) {
+ Instruction *Insn = *II;
+ BasicBlock *BB = Insn->getParent();
+ BasicBlock *NewHoistBB;
+ Instruction *NewHoistPt;
+
+ if (BB == HoistBB) { // Both are in the same Basic Block.
+ NewHoistBB = HoistBB;
+ NewHoistPt = firstInBB(Insn, HoistPt) ? Insn : HoistPt;
+ } else {
+ // If the hoisting point contains one of the instructions,
+ // then hoist there, otherwise hoist before the terminator.
+ NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
+ if (NewHoistBB == BB)
+ NewHoistPt = Insn;
+ else if (NewHoistBB == HoistBB)
+ NewHoistPt = HoistPt;
+ else
+ NewHoistPt = NewHoistBB->getTerminator();
+ }
+
+ SmallPtrSet<const BasicBlock *, 2> WL;
+ WL.insert(HoistBB);
+ WL.insert(BB);
+
+ if (K == InsKind::Scalar) {
+ if (safeToHoistScalar(NewHoistBB, WL, NumBBsOnAllPaths)) {
+ // Extend HoistPt to NewHoistPt.
+ HoistPt = NewHoistPt;
+ HoistBB = NewHoistBB;
+ continue;
+ }
+ } else {
+        // When NewHoistBB already contains an instruction to be hoisted, the
+        // expression is needed on all paths.
+ // Check that the hoisted expression is needed on all paths: it is
+ // unsafe to hoist loads to a place where there may be a path not
+ // loading from the same address: for instance there may be a branch on
+ // which the address of the load may not be initialized.
+ if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
+ hoistingFromAllPaths(NewHoistBB, WL)) &&
+ // Also check that it is safe to move the load or store from HoistPt
+ // to NewHoistPt, and from Insn to NewHoistPt.
+ safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NumBBsOnAllPaths) &&
+ safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn),
+ K, NumBBsOnAllPaths)) {
+ // Extend HoistPt to NewHoistPt.
+ HoistPt = NewHoistPt;
+ HoistBB = NewHoistBB;
+ continue;
+ }
+ }
+
+ // At this point it is not safe to extend the current hoisting to
+ // NewHoistPt: save the hoisting list so far.
+ if (std::distance(Start, II) > 1)
+ HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+
+ // Start over from BB.
+ Start = II;
+ if (K != InsKind::Scalar)
+ UD = MSSA->getMemoryAccess(*Start);
+ HoistPt = Insn;
+ HoistBB = BB;
+ NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+ }
+
+ // Save the last partition.
+ if (std::distance(Start, II) > 1)
+ HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+ }
+
+ // Initialize HPL from Map.
+ void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
+ InsKind K) {
+ for (const auto &Entry : Map) {
+ if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold)
+ return;
+
+ const SmallVecInsn &V = Entry.second;
+ if (V.size() < 2)
+ continue;
+
+ // Compute the insertion point and the list of expressions to be hoisted.
+ SmallVecInsn InstructionsToHoist;
+ for (auto I : V)
+ // We don't need to check for hoist-barriers here because if
+ // I->getParent() is a barrier then I precedes the barrier.
+ if (!hasEH(I->getParent()))
+ InstructionsToHoist.push_back(I);
+
+ if (!InstructionsToHoist.empty())
+ partitionCandidates(InstructionsToHoist, HPL, K);
+ }
+ }
+
+  // Return true when all operands of I are available at the insertion point
+  // HoistPt. When limiting the number of hoisted expressions, one could hoist
+  // a load without hoisting its access function. So before hoisting any
+  // expression, make sure that all its operands are available at the
+  // insertion point.
+ bool allOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt))
+ return false;
+
+ return true;
+ }
+
+ // Same as allOperandsAvailable with recursive check for GEP operands.
+ bool allGepOperandsAvailable(const Instruction *I,
+ const BasicBlock *HoistPt) const {
+ for (const Use &Op : I->operands())
+ if (const auto *Inst = dyn_cast<Instruction>(&Op))
+ if (!DT->dominates(Inst->getParent(), HoistPt)) {
+ if (const GetElementPtrInst *GepOp =
+ dyn_cast<GetElementPtrInst>(Inst)) {
+ if (!allGepOperandsAvailable(GepOp, HoistPt))
+ return false;
+ // Gep is available if all operands of GepOp are available.
+ } else {
+ // Gep is not available if it has operands other than GEPs that are
+ // defined in blocks not dominating HoistPt.
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Make all operands of the GEP available.
+ void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
+ Instruction *Gep) const {
+ assert(allGepOperandsAvailable(Gep, HoistPt) &&
+ "GEP operands not available");
+
+ Instruction *ClonedGep = Gep->clone();
+ for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
+
+ // Check whether the operand is already available.
+ if (DT->dominates(Op->getParent(), HoistPt))
+ continue;
+
+ // As a GEP can refer to other GEPs, recursively make all the operands
+ // of this GEP available at HoistPt.
+ if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
+ makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
+ }
+
+ // Copy Gep and replace its uses in Repl with ClonedGep.
+ ClonedGep->insertBefore(HoistPt->getTerminator());
+
+ // Conservatively discard any optimization hints, they may differ on the
+ // other paths.
+ ClonedGep->dropUnknownNonDebugMetadata();
+
+ // If we have optimization hints which agree with each other along different
+ // paths, preserve them.
+ for (const Instruction *OtherInst : InstructionsToHoist) {
+ const GetElementPtrInst *OtherGep;
+ if (auto *OtherLd = dyn_cast<LoadInst>(OtherInst))
+ OtherGep = cast<GetElementPtrInst>(OtherLd->getPointerOperand());
+ else
+ OtherGep = cast<GetElementPtrInst>(
+ cast<StoreInst>(OtherInst)->getPointerOperand());
+ ClonedGep->andIRFlags(OtherGep);
+ }
+
+ // Replace uses of Gep with ClonedGep in Repl.
+ Repl->replaceUsesOfWith(Gep, ClonedGep);
+ }
+
+  // In the case Repl is a load or a store, we make all of its GEP operands
+  // available: GEPs are not hoisted by default, to avoid hoisting the address
+  // computations without the associated load or store.
+ bool makeGepOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist) const {
+ // Check whether the GEP of a ld/st can be synthesized at HoistPt.
+ GetElementPtrInst *Gep = nullptr;
+ Instruction *Val = nullptr;
+ if (auto *Ld = dyn_cast<LoadInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand());
+ } else if (auto *St = dyn_cast<StoreInst>(Repl)) {
+ Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand());
+ Val = dyn_cast<Instruction>(St->getValueOperand());
+ // Check that the stored value is available.
+ if (Val) {
+ if (isa<GetElementPtrInst>(Val)) {
+ // Check whether we can compute the GEP at HoistPt.
+ if (!allGepOperandsAvailable(Val, HoistPt))
+ return false;
+ } else if (!DT->dominates(Val->getParent(), HoistPt))
+ return false;
+ }
+ }
+
+ // Check whether we can compute the Gep at HoistPt.
+ if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
+ return false;
+
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
+
+ if (Val && isa<GetElementPtrInst>(Val))
+ makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
+
+ return true;
+ }
+
+ std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) {
+ unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
+ for (const HoistingPointInfo &HP : HPL) {
+ // Find out whether we already have one of the instructions in HoistPt,
+ // in which case we do not have to move it.
+ BasicBlock *HoistPt = HP.first;
+ const SmallVecInsn &InstructionsToHoist = HP.second;
+ Instruction *Repl = nullptr;
+ for (Instruction *I : InstructionsToHoist)
+ if (I->getParent() == HoistPt)
+ // If there are two instructions in HoistPt to be hoisted in place:
+ // update Repl to be the first one, such that we can rename the uses
+ // of the second based on the first.
+ if (!Repl || firstInBB(I, Repl))
+ Repl = I;
+
+ // Keep track of whether we moved the instruction so we know whether we
+ // should move the MemoryAccess.
+ bool MoveAccess = true;
+ if (Repl) {
+ // Repl is already in HoistPt: it remains in place.
+ assert(allOperandsAvailable(Repl, HoistPt) &&
+ "instruction depends on operands that are not available");
+ MoveAccess = false;
+ } else {
+ // When we do not find Repl in HoistPt, select the first in the list
+ // and move it to HoistPt.
+ Repl = InstructionsToHoist.front();
+
+ // We can move Repl in HoistPt only when all operands are available.
+ // The order in which hoistings are done may influence the availability
+ // of operands.
+ if (!allOperandsAvailable(Repl, HoistPt)) {
+
+ // When HoistingGeps there is nothing more we can do to make the
+ // operands available: just continue.
+ if (HoistingGeps)
+ continue;
+
+ // When not HoistingGeps we need to copy the GEPs.
+ if (!makeGepOperandsAvailable(Repl, HoistPt, InstructionsToHoist))
+ continue;
+ }
+
+        // Move the instruction to the end of HoistPt.
+ Instruction *Last = HoistPt->getTerminator();
+ MD->removeInstruction(Repl);
+ Repl->moveBefore(Last);
+
+ DFSNumber[Repl] = DFSNumber[Last]++;
+ }
+
+ MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl);
+
+ if (MoveAccess) {
+ if (MemoryUseOrDef *OldMemAcc =
+ dyn_cast_or_null<MemoryUseOrDef>(NewMemAcc)) {
+ // The definition of this ld/st will not change: ld/st hoisting is
+ // legal when the ld/st is not moved past its current definition.
+ MemoryAccess *Def = OldMemAcc->getDefiningAccess();
+ NewMemAcc =
+ MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ OldMemAcc->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMemAcc);
+ }
+ }
+
+ if (isa<LoadInst>(Repl))
+ ++NL;
+ else if (isa<StoreInst>(Repl))
+ ++NS;
+ else if (isa<CallInst>(Repl))
+ ++NC;
+ else // Scalar
+ ++NI;
+
+ // Remove and rename all other instructions.
+ for (Instruction *I : InstructionsToHoist)
+ if (I != Repl) {
+ ++NR;
+ if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlignment(),
+ cast<LoadInst>(I)->getAlignment()));
+ ++NumLoadsRemoved;
+ } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+ ReplacementStore->setAlignment(
+ std::min(ReplacementStore->getAlignment(),
+ cast<StoreInst>(I)->getAlignment()));
+ ++NumStoresRemoved;
+ } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+ ReplacementAlloca->setAlignment(
+ std::max(ReplacementAlloca->getAlignment(),
+ cast<AllocaInst>(I)->getAlignment()));
+ } else if (isa<CallInst>(Repl)) {
+ ++NumCallsRemoved;
+ }
+
+ if (NewMemAcc) {
+ // Update the uses of the old MSSA access with NewMemAcc.
+ MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
+ OldMA->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMA);
+ }
+
+ Repl->andIRFlags(I);
+ combineKnownMetadata(Repl, I);
+ I->replaceAllUsesWith(Repl);
+ // Also invalidate the Alias Analysis cache.
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+
+ // Remove MemorySSA phi nodes with the same arguments.
+ if (NewMemAcc) {
+ SmallPtrSet<MemoryPhi *, 4> UsePhis;
+ for (User *U : NewMemAcc->users())
+ if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
+ UsePhis.insert(Phi);
+
+ for (auto *Phi : UsePhis) {
+ auto In = Phi->incoming_values();
+ if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
+ Phi->replaceAllUsesWith(NewMemAcc);
+ MSSAUpdater->removeMemoryAccess(Phi);
+ }
+ }
+ }
+ }
+
+ NumHoisted += NL + NS + NC + NI;
+ NumRemoved += NR;
+ NumLoadsHoisted += NL;
+ NumStoresHoisted += NS;
+ NumCallsHoisted += NC;
+ return {NI, NL + NC + NS};
+ }
+
+  // Hoist all expressions. Returns the number of scalars hoisted
+  // and the number of non-scalars hoisted.
+ std::pair<unsigned, unsigned> hoistExpressions(Function &F) {
+ InsnInfo II;
+ LoadInfo LI;
+ StoreInfo SI;
+ CallInfo CI;
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ int InstructionNb = 0;
+ for (Instruction &I1 : *BB) {
+        // If I1 cannot guarantee progress, subsequent instructions
+        // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
+ // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
+ // deeper may increase the register pressure and compilation time.
+ if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
+ break;
+
+ // Do not value number terminator instructions.
+ if (isa<TerminatorInst>(&I1))
+ break;
+
+ if (auto *Load = dyn_cast<LoadInst>(&I1))
+ LI.insert(Load, VN);
+ else if (auto *Store = dyn_cast<StoreInst>(&I1))
+ SI.insert(Store, VN);
+ else if (auto *Call = dyn_cast<CallInst>(&I1)) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
+ if (isa<DbgInfoIntrinsic>(Intr) ||
+ Intr->getIntrinsicID() == Intrinsic::assume)
+ continue;
+ }
+ if (Call->mayHaveSideEffects())
+ break;
+
+ if (Call->isConvergent())
+ break;
+
+ CI.insert(Call, VN);
+ } else if (HoistingGeps || !isa<GetElementPtrInst>(&I1))
+ // Do not hoist scalars past calls that may write to memory because
+ // that could result in spills later. geps are handled separately.
+ // TODO: We can relax this for targets like AArch64 as they have more
+ // registers than X86.
+ II.insert(&I1, VN);
+ }
+ }
+
+ HoistingPointList HPL;
+ computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store);
+ computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar);
+ computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load);
+ computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store);
+ return hoist(HPL);
+ }
+};
+
+class GVNHoistLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNHoistLegacyPass() : FunctionPass(ID) {
+ initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+
+ GVNHoist G(&DT, &AA, &MD, &MSSA);
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace llvm
+
+PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ GVNHoist G(&DT, &AA, &MD, &MSSA);
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNHoistLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+
+FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
new file mode 100644
index 000000000000..5fd2dfc118b4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -0,0 +1,883 @@
+//===- GVNSink.cpp - sink expressions into successors -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking however is for a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The PostValueTable class in GVN provides this
+/// mapping.
+///
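+/// Concretely, in the example above %a1 and %c1 receive the same use-based
+/// number: each is used only by an xor with 1 whose result feeds the phi %e,
+/// so sinking both adds into the common successor behind a phi merging %b
+/// and %d contributes in an equivalent way to everything downstream.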
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <unordered_set>
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // namespace GVNExpression
+} // namespace llvm
+
+namespace {
+
+static bool isMemoryInst(const Instruction *I) {
+ return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
+ (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+/// LockstepReverseIterator I([B1, B2, B3]);
+/// *I-- = [B1[n], B2[n], B3[n]];
+/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+/// ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in
+/// the list returned by operator*.
+class LockstepReverseIterator {
+ ArrayRef<BasicBlock *> Blocks;
+ SmallPtrSet<BasicBlock *, 4> ActiveBlocks;
+ SmallVector<Instruction *, 4> Insts;
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ ActiveBlocks.clear();
+ for (BasicBlock *BB : Blocks)
+ ActiveBlocks.insert(BB);
+ Insts.clear();
+ for (BasicBlock *BB : Blocks) {
+ if (BB->size() <= 1) {
+ // Block wasn't big enough - only contained a terminator.
+ ActiveBlocks.erase(BB);
+ continue;
+ }
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ }
+ if (Insts.empty())
+ Fail = true;
+ }
+
+ bool isValid() const { return !Fail; }
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+ SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+ void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) {
+ for (auto II = Insts.begin(); II != Insts.end();) {
+ if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
+ Blocks.end()) {
+ ActiveBlocks.erase((*II)->getParent());
+ II = Insts.erase(II);
+ } else {
+ ++II;
+ }
+ }
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *Inst : Insts) {
+ if (Inst == &Inst->getParent()->front())
+ ActiveBlocks.erase(Inst->getParent());
+ else
+ NewInsts.push_back(Inst->getPrevNode());
+ }
+ if (NewInsts.empty()) {
+ Fail = true;
+ return;
+ }
+ Insts = NewInsts;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+struct SinkingInstructionCandidate {
+ unsigned NumBlocks;
+ unsigned NumInstructions;
+ unsigned NumPHIs;
+ unsigned NumMemoryInsts;
+ int Cost = -1;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+    // PHIs are expensive, so make sure they're worth it.
+    Cost = (NumInstructions * (NumBlocks - 1)) -
+           (NumExtraPHIs * NumExtraPHIs) - SplitEdgeCost;
+ }
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
+
+#ifndef NDEBUG
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we were to sink a sequence of instructions. It
+/// provides a hash function for efficient equality comparisons.
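+///
+/// For example (hypothetical): sinking "%a = add i32 %x, 1" from %B1 and
+/// "%c = add i32 %y, 1" from %B2 into their common successor would require
+/// the PHI
+///   %p = phi i32 [ %x, %B1 ], [ %y, %B2 ]
+/// to merge the operands of the sunk add; that PHI is what ModelledPHI
+/// models.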
+class ModelledPHI {
+ SmallVector<Value *, 4> Values;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+ ModelledPHI() {}
+ ModelledPHI(const PHINode *PN) {
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+ Blocks.push_back(PN->getIncomingBlock(I));
+ std::sort(Blocks.begin(), Blocks.end());
+
+ // This assumes the PHI is already well-formed and there aren't conflicting
+ // incoming values for the same block.
+ for (auto *B : Blocks)
+ Values.push_back(PN->getIncomingValueForBlock(B));
+ }
+ /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
+ /// without the same ID.
+ /// \note This is specifically for DenseMapInfo - do not use this!
+ static ModelledPHI createDummy(size_t ID) {
+ ModelledPHI M;
+ M.Values.push_back(reinterpret_cast<Value*>(ID));
+ return M;
+ }
+
+ /// Create a PHI from an array of incoming values and incoming blocks.
+ template <typename VArray, typename BArray>
+ ModelledPHI(const VArray &V, const BArray &B) {
+ std::copy(V.begin(), V.end(), std::back_inserter(Values));
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ }
+
+ /// Create a PHI from [I[OpNum] for I in Insts].
+ template <typename BArray>
+ ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ for (auto *I : Insts)
+ Values.push_back(I->getOperand(OpNum));
+ }
+
+ /// Restrict the PHI's contents down to only \c NewBlocks.
+ /// \c NewBlocks must be a subset of \c this->Blocks.
+ void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) {
+ auto BI = Blocks.begin();
+ auto VI = Values.begin();
+ while (BI != Blocks.end()) {
+ assert(VI != Values.end());
+ if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) ==
+ NewBlocks.end()) {
+ BI = Blocks.erase(BI);
+ VI = Values.erase(VI);
+ } else {
+ ++BI;
+ ++VI;
+ }
+ }
+ assert(Blocks.size() == NewBlocks.size());
+ }
+
+ ArrayRef<Value *> getValues() const { return Values; }
+
+ bool areAllIncomingValuesSame() const {
+ return all_of(Values, [&](Value *V) { return V == Values[0]; });
+ }
+ bool areAllIncomingValuesSameType() const {
+ return all_of(
+ Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+ }
+ bool areAnyIncomingValuesConstant() const {
+ return any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ }
+ // Hash functor
+ unsigned hash() const {
+ return (unsigned)hash_combine_range(Values.begin(), Values.end());
+ }
+ bool operator==(const ModelledPHI &Other) const {
+ return Values == Other.Values && Blocks == Other.Blocks;
+ }
+};
+
+template <typename ModelledPHI> struct DenseMapInfo {
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet;
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with side effects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+ unsigned MemoryUseOrder = -1;
+ bool Volatile = false;
+
+public:
+ InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+ BumpPtrAllocator &A)
+ : GVNExpression::BasicExpression(I->getNumUses()) {
+ allocateOperands(R, A);
+ setOpcode(I->getOpcode());
+ setType(I->getType());
+
+ for (auto &U : I->uses())
+ op_push_back(U.getUser());
+ std::sort(op_begin(), op_end());
+ }
+ void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+ void setVolatile(bool V) { Volatile = V; }
+
+ virtual hash_code getHashValue() const {
+ return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+ MemoryUseOrder, Volatile);
+ }
+
+ template <typename Function> hash_code getHashValue(Function MapFn) {
+ hash_code H =
+ hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+ for (auto *V : operands())
+ H = hash_combine(H, MapFn(V));
+ return H;
+ }
+};
+
+class ValueTable {
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<size_t, uint32_t> HashNumbering;
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ uint32_t nextValueNumber;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ ValueTable() : nextValueNumber(1) {}
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+ /// This is a very hard problem in general, so we use domain-specific
+ /// knowledge that we only ever check for equivalence between blocks sharing a
+ /// single immediate successor that is common, and when determining if I1 ==
+ /// I2 we will have already determined that next(I1) == next(I2). This
+ /// inductive property allows us to simply return the value number of the next
+ /// instruction that defines memory.
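+  ///
+  /// For example (a hypothetical block):
+  ///   %v = load i32, i32* %p
+  ///   store i32 %v, i32* %q
+  /// getMemoryUseOrder of the load returns the value number of the store,
+  /// the next instruction in the block that defines memory.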
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+ GVNSink() : VN() {}
+ bool run(Function &F) {
+ DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+
+ unsigned NumSunk = 0;
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (auto *N : RPOT)
+ NumSunk += sinkBB(N);
+
+ return NumSunk > 0;
+ }
+
+private:
+ ValueTable VN;
+
+ bool isInstructionBlacklisted(Instruction *I) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return true;
+ return false;
+ }
+
+ /// The main heuristic function. Analyze the set of instructions pointed to by
+ /// LRI and return a candidate solution if these instructions can be sunk, or
+ /// None otherwise.
+ Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+ /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
+ void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+ SmallPtrSetImpl<Value *> &PHIContents) {
+ for (auto &I : *BB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ return;
+
+ auto MPHI = ModelledPHI(PN);
+ PHIs.insert(MPHI);
+ for (auto *V : MPHI.getValues())
+ PHIContents.insert(V);
+ }
+ }
+
+  /// The main instruction sinking driver. Set up state and try to sink
+  /// instructions into BBEnd from its predecessors.
+ unsigned sinkBB(BasicBlock *BBEnd);
+
+ /// Perform the actual mechanics of sinking an instruction from Blocks into
+ /// BBEnd, which is their only successor.
+ void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+  /// Remove PHIs whose incoming values are all the same.
+ void foldPointlessPHINodes(BasicBlock *BB) {
+ auto I = BB->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+ if (!all_of(PN->incoming_values(),
+ [&](const Value *V) { return V == PN->getIncomingValue(0); }))
+ continue;
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+ }
+};
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
+ I->dump();
+ } dbgs() << " ]\n";);
+
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+  // Now restrict the set of incoming blocks to only those whose candidate
+  // instruction has value number VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.erase(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (isInstructionBlacklisted(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
+ if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+ NeededPHIs.erase(NewPHI);
+ RecomputePHIContents = true;
+ }
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+  // Is this instruction required by a later PHI that doesn't match this PHI?
+  // If so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+      // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
+ for (auto *C : ActivePreds)
+ Cand.Blocks.push_back(C);
+
+ return Cand;
+}
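The selection step at the top of this routine is simple frequency counting: every candidate terminal instruction gets a value number, the most common number wins, and predecessors whose instruction carries a different number are dropped from the active set. A standalone sketch of just that step, with made-up value numbers rather than the pass's real data structures:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Value numbers of the last instruction in each predecessor block
  // (hypothetical numbers; in the pass they come from ValueTable).
  std::vector<uint32_t> LastInstVNs = {7, 7, 9, 7};

  std::map<uint32_t, unsigned> VNums;
  for (uint32_t N : LastInstVNs)
    VNums[N]++;

  auto Best = std::max_element(
      VNums.begin(), VNums.end(),
      [](const std::pair<const uint32_t, unsigned> &A,
         const std::pair<const uint32_t, unsigned> &B) {
        return A.second < B.second;
      });

  if (Best->second < 2) {
    std::cout << "nothing to sink\n";          // A lone instruction can't sink.
    return 0;
  }
  // Only blocks whose last instruction has this number stay "active".
  std::cout << "sink value number " << Best->first << " from " << Best->second
            << " of " << LastInstVNs.size() << " blocks\n";
}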
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ std::sort(Preds.begin(), Preds.end());
+
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ std::stable_sort(
+ Candidates.begin(), Candidates.end(),
+ [](const SinkingInstructionCandidate &A,
+ const SinkingInstructionCandidate &B) { return A > B; });
+ DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
+
+  // Pick the top candidate, as long as its cost is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ if (C.Blocks.size() < NumOrigPreds) {
+ DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+ dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
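Whether an operand needs a new PHI is decided purely by comparing the corresponding operand across the instructions being merged: if every block supplies the same value it is reused directly, otherwise a PHI gathers the per-block values. A standalone sketch of that per-operand scan, with strings standing in for llvm::Value pointers (all names here are illustrative only):

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Operands of the would-be-sunk instruction in each predecessor block.
  std::vector<std::vector<std::string>> Operands = {
      {"%a", "%len"}, // block 1
      {"%b", "%len"}, // block 2
      {"%c", "%len"}, // block 3
  };

  const auto &First = Operands.front();
  for (unsigned O = 0; O != First.size(); ++O) {
    bool NeedPHI = false;
    for (const auto &Ops : Operands)
      NeedPHI |= (Ops[O] != First[O]);
    std::cout << "operand " << O
              << (NeedPHI ? ": needs a PHI\n" : ": identical, reused as-is\n");
  }
  // Prints: operand 0: needs a PHI
  //         operand 1: identical, reused as-is
}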
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNSinkLegacyPass() : FunctionPass(ID) {
+ initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ GVNSink G;
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+ GVNSink G;
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
new file mode 100644
index 000000000000..fb7c6e15758d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -0,0 +1,694 @@
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass. The semantics of the
+// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
+// more often than it did before the transform. This optimization is called
+// "widening" and can be used to hoist and common runtime checks in situations
+// like these:
+//
+// %cmp0 = 7 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// %cmp1 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+// ...
+//
+// =>
+//
+// %cmp0 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward. It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress. It hasn't been tuned to be "production
+// ready" yet. It is known to have quadriatic running time and will not scale
+// to large numbers of guards
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+namespace {
+
+class GuardWideningImpl {
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ LoopInfo &LI;
+
+ /// The set of guards whose conditions have been widened into dominating
+ /// guards.
+ SmallVector<IntrinsicInst *, 16> EliminatedGuards;
+
+ /// The set of guards which have been widened to include conditions to other
+ /// guards.
+ DenseSet<IntrinsicInst *> WidenedGuards;
+
+ /// Try to eliminate guard \p Guard by widening it into an earlier dominating
+ /// guard. \p DFSI is the DFS iterator on the dominator tree that is
+ /// currently visiting the block containing \p Guard, and \p GuardsPerBlock
+ /// maps BasicBlocks to the set of guards seen in that block.
+ bool eliminateGuardViaWidening(
+ IntrinsicInst *Guard, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsPerBlock);
+
+ /// Used to keep track of which widening potential is more effective.
+ enum WideningScore {
+ /// Don't widen.
+ WS_IllegalOrNegative,
+
+    /// Widening is performance neutral as far as the cycles spent in check
+    /// conditions go (but can still help in other ways, e.g., code layout,
+    /// carrying less deopt state).
+ WS_Neutral,
+
+ /// Widening is profitable.
+ WS_Positive,
+
+    /// Widening is very profitable. Not significantly different from \c
+    /// WS_Positive, except that it ranks higher in the score ordering.
+ WS_VeryPositive
+ };
+
+ static StringRef scoreTypeToString(WideningScore WS);
+
+ /// Compute the score for widening the condition in \p DominatedGuard
+ /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in
+ /// \p DominatingGuardLoop).
+ WideningScore computeWideningScore(IntrinsicInst *DominatedGuard,
+ Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard,
+ Loop *DominatingGuardLoop);
+
+ /// Helper to check if \p V can be hoisted to \p InsertPos.
+ bool isAvailableAt(Value *V, Instruction *InsertPos) {
+ SmallPtrSet<Instruction *, 8> Visited;
+ return isAvailableAt(V, InsertPos, Visited);
+ }
+
+ bool isAvailableAt(Value *V, Instruction *InsertPos,
+ SmallPtrSetImpl<Instruction *> &Visited);
+
+ /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
+ /// isAvailableAt returned true.
+ void makeAvailableAt(Value *V, Instruction *InsertPos);
+
+ /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
+ /// to generate an expression computing the logical AND of \p Cond0 and \p
+ /// Cond1. Return true if the expression computing the AND is only as
+  /// expensive as computing one of the two. If \p InsertPt is non-null then
+ /// actually generate the resulting expression, make it available at \p
+ /// InsertPt and return it in \p Result (else no change to the IR is made).
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+ Value *&Result);
+
+ /// Represents a range check of the form \c Base + \c Offset u< \c Length,
+ /// with the constraint that \c Length is not negative. \c CheckInst is the
+ /// pre-existing instruction in the IR that computes the result of this range
+ /// check.
+ class RangeCheck {
+ Value *Base;
+ ConstantInt *Offset;
+ Value *Length;
+ ICmpInst *CheckInst;
+
+ public:
+ explicit RangeCheck(Value *Base, ConstantInt *Offset, Value *Length,
+ ICmpInst *CheckInst)
+ : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
+
+ void setBase(Value *NewBase) { Base = NewBase; }
+ void setOffset(ConstantInt *NewOffset) { Offset = NewOffset; }
+
+ Value *getBase() const { return Base; }
+ ConstantInt *getOffset() const { return Offset; }
+ const APInt &getOffsetValue() const { return getOffset()->getValue(); }
+    Value *getLength() const { return Length; }
+ ICmpInst *getCheckInst() const { return CheckInst; }
+
+ void print(raw_ostream &OS, bool PrintTypes = false) {
+ OS << "Base: ";
+ Base->printAsOperand(OS, PrintTypes);
+ OS << " Offset: ";
+ Offset->printAsOperand(OS, PrintTypes);
+ OS << " Length: ";
+ Length->printAsOperand(OS, PrintTypes);
+ }
+
+ LLVM_DUMP_METHOD void dump() {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ };
+
+  /// Parse \p CheckCond into a conjunction (logical-and) of range checks and
+  /// append them to \p Checks. Returns true on success; may clobber \c Checks
+  /// on failure.
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
+ SmallPtrSet<Value *, 8> Visited;
+ return parseRangeChecks(CheckCond, Checks, Visited);
+ }
+
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+  /// Combine the checks in \p Checks into a smaller set of checks and append
+  /// them to \p CombinedChecks. Return true on success (i.e. all of the checks
+  /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
+  /// and \p CombinedChecks both on success and on failure.
+ bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
+ SmallVectorImpl<RangeCheck> &CombinedChecks);
+
+ /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+ /// computing only one of the two expressions?
+ bool isWideningCondProfitable(Value *Cond0, Value *Cond1) {
+ Value *ResultUnused;
+ return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused);
+ }
+
+ /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to
+ /// whatever it is already checking).
+ void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) {
+ Value *Result;
+ widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result);
+ ToWiden->setArgOperand(0, Result);
+ }
+
+public:
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
+ LoopInfo &LI)
+ : DT(DT), PDT(PDT), LI(LI) {}
+
+ /// The entry point for this pass.
+ bool run();
+};
+
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+ GuardWideningPass Impl;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ return GuardWideningImpl(
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+}
+
+bool GuardWideningImpl::run() {
+ using namespace llvm::PatternMatch;
+
+ DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
+ bool Changed = false;
+
+ for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+ DFI != DFE; ++DFI) {
+ auto *BB = (*DFI)->getBlock();
+ auto &CurrentList = GuardsInBlock[BB];
+
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ CurrentList.push_back(cast<IntrinsicInst>(&I));
+
+ for (auto *II : CurrentList)
+ Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
+ }
+
+ for (auto *II : EliminatedGuards)
+ if (!WidenedGuards.count(II))
+ II->eraseFromParent();
+
+ return Changed;
+}
+
+bool GuardWideningImpl::eliminateGuardViaWidening(
+ IntrinsicInst *GuardInst, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+ GuardsInBlock) {
+ IntrinsicInst *BestSoFar = nullptr;
+ auto BestScoreSoFar = WS_IllegalOrNegative;
+ auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent());
+
+ // In the set of dominating guards, find the one we can merge GuardInst with
+ // for the most profit.
+ for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+ auto *CurBB = DFSI.getPath(i)->getBlock();
+ auto *CurLoop = LI.getLoopFor(CurBB);
+ assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+ const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+ auto I = GuardsInCurBB.begin();
+ auto E = GuardsInCurBB.end();
+
+#ifndef NDEBUG
+ {
+ unsigned Index = 0;
+ for (auto &I : *CurBB) {
+ if (Index == GuardsInCurBB.size())
+ break;
+ if (GuardsInCurBB[Index] == &I)
+ Index++;
+ }
+ assert(Index == GuardsInCurBB.size() &&
+ "Guards expected to be in order!");
+ }
+#endif
+
+ assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?");
+
+ if (i == (e - 1)) {
+ // Corner case: make sure we're only looking at guards strictly dominating
+ // GuardInst when visiting GuardInst->getParent().
+ auto NewEnd = std::find(I, E, GuardInst);
+ assert(NewEnd != E && "GuardInst not in its own block?");
+ E = NewEnd;
+ }
+
+ for (auto *Candidate : make_range(I, E)) {
+ auto Score =
+ computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
+ DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+ << " and " << *Candidate->getArgOperand(0) << " is "
+ << scoreTypeToString(Score) << "\n");
+ if (Score > BestScoreSoFar) {
+ BestScoreSoFar = Score;
+ BestSoFar = Candidate;
+ }
+ }
+ }
+
+ if (BestScoreSoFar == WS_IllegalOrNegative) {
+ DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+ return false;
+ }
+
+ assert(BestSoFar != GuardInst && "Should have never visited same guard!");
+ assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
+
+ DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+ widenGuard(BestSoFar, GuardInst->getArgOperand(0));
+ GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
+ EliminatedGuards.push_back(GuardInst);
+ WidenedGuards.insert(BestSoFar);
+ return true;
+}
+
+GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
+ IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop,
+ IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) {
+ bool HoistingOutOfLoop = false;
+
+ if (DominatingGuardLoop != DominatedGuardLoop) {
+ if (DominatingGuardLoop &&
+ !DominatingGuardLoop->contains(DominatedGuardLoop))
+ return WS_IllegalOrNegative;
+
+ HoistingOutOfLoop = true;
+ }
+
+ if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
+ return WS_IllegalOrNegative;
+
+ bool HoistingOutOfIf =
+ !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
+
+ if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
+ DominatingGuard->getArgOperand(0)))
+ return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+ if (HoistingOutOfLoop)
+ return WS_Positive;
+
+ return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
+ SmallPtrSetImpl<Instruction *> &Visited) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+ return true;
+
+ if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+ Inst->mayReadFromMemory())
+ return false;
+
+ Visited.insert(Inst);
+
+ // We only want to go _up_ the dominance chain when recursing.
+ assert(!isa<PHINode>(Loc) &&
+ "PHIs should return false for isSafeToSpeculativelyExecute");
+ assert(DT.isReachableFromEntry(Inst->getParent()) &&
+ "We did a DFS from the block entry!");
+ return all_of(Inst->operands(),
+ [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc))
+ return;
+
+ assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+ !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+ for (Value *Op : Inst->operands())
+ makeAvailableAt(Op, Loc);
+
+ Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+ Instruction *InsertPt, Value *&Result) {
+ using namespace llvm::PatternMatch;
+
+ {
+ // L >u C0 && L >u C1 -> L >u max(C0, C1)
+ ConstantInt *RHS0, *RHS1;
+ Value *LHS;
+ ICmpInst::Predicate Pred0, Pred1;
+ if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+ match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+
+ ConstantRange CR0 =
+ ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
+ ConstantRange CR1 =
+ ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
+
+ // SubsetIntersect is a subset of the actual mathematical intersection of
+ // CR0 and CR1, while SupersetIntersect is a superset of the actual
+ // mathematical intersection. If these two ConstantRanges are equal, then
+ // we know we were able to represent the actual mathematical intersection
+ // of CR0 and CR1, and can use the same to generate an icmp instruction.
+ //
+ // Given what we're doing here and the semantics of guards, it would
+ // actually be correct to just use SubsetIntersect, but that may be too
+ // aggressive in cases we care about.
+ auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
+ auto SupersetIntersect = CR0.intersectWith(CR1);
+
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (SubsetIntersect == SupersetIntersect &&
+ SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
+ }
+ }
+ }
+
+ {
+ SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
+ if (parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
+ combineRangeChecks(Checks, CombinedChecks)) {
+ if (InsertPt) {
+ Result = nullptr;
+ for (auto &RC : CombinedChecks) {
+ makeAvailableAt(RC.getCheckInst(), InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
+ InsertPt);
+ else
+ Result = RC.getCheckInst();
+ }
+
+ Result->setName("wide.chk");
+ }
+ return true;
+ }
+ }
+
+ // Base case -- just logical-and the two conditions together.
+
+ if (InsertPt) {
+ makeAvailableAt(Cond0, InsertPt);
+ makeAvailableAt(Cond1, InsertPt);
+
+ Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+ }
+
+ // We were not able to compute Cond0 AND Cond1 for the price of one.
+ return false;
+}
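The first special case above is the one the file header motivates: two comparisons of a common value against constants can often be replaced by a single comparison against a combined bound, which is exactly what makes the widened guard no more expensive than one of the originals. A toy demonstration of that folding for the unsigned greater-than case, using plain integers instead of ConstantRange (purely illustrative):

#include <algorithm>
#include <cstdint>
#include <iostream>

// For unsigned L:  (L >u C0) && (L >u C1)  ==  L >u max(C0, C1)
// so the widened guard costs one comparison instead of two.
int main() {
  uint64_t C0 = 7, C1 = 9;
  for (uint64_t L : {6ULL, 8ULL, 10ULL}) {
    bool Both = (L > C0) && (L > C1);
    bool Widened = L > std::max(C0, C1);
    std::cout << "L=" << L << " both=" << Both << " widened=" << Widened
              << "\n"; // The two columns always agree.
  }
}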
+
+bool GuardWideningImpl::parseRangeChecks(
+ Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!Visited.insert(CheckCond).second)
+ return true;
+
+ using namespace llvm::PatternMatch;
+
+ {
+ Value *AndLHS, *AndRHS;
+ if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
+ return parseRangeChecks(AndLHS, Checks) &&
+ parseRangeChecks(AndRHS, Checks);
+ }
+
+ auto *IC = dyn_cast<ICmpInst>(CheckCond);
+ if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
+ (IC->getPredicate() != ICmpInst::ICMP_ULT &&
+ IC->getPredicate() != ICmpInst::ICMP_UGT))
+ return false;
+
+ Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
+ if (IC->getPredicate() == ICmpInst::ICMP_UGT)
+ std::swap(CmpLHS, CmpRHS);
+
+ auto &DL = IC->getModule()->getDataLayout();
+
+ GuardWideningImpl::RangeCheck Check(
+ CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
+ CmpRHS, IC);
+
+ if (!isKnownNonNegative(Check.getLength(), DL))
+ return false;
+
+ // What we have in \c Check now is a correct interpretation of \p CheckCond.
+ // Try to see if we can move some constant offsets into the \c Offset field.
+
+ bool Changed;
+ auto &Ctx = CheckCond->getContext();
+
+ do {
+ Value *OpLHS;
+ ConstantInt *OpRHS;
+ Changed = false;
+
+#ifndef NDEBUG
+ auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
+ assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
+ "Unreachable instruction?");
+#endif
+
+ if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ } else if (match(Check.getBase(),
+ m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ }
+ }
+ } while (Changed);
+
+ Checks.push_back(Check);
+ return true;
+}
+
+bool GuardWideningImpl::combineRangeChecks(
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) {
+ unsigned OldCount = Checks.size();
+ while (!Checks.empty()) {
+ // Pick all of the range checks with a specific base and length, and try to
+ // merge them.
+ Value *CurrentBase = Checks.front().getBase();
+ Value *CurrentLength = Checks.front().getLength();
+
+ SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
+
+ auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
+ return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
+ };
+
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
+ Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
+
+ assert(CurrentChecks.size() != 0 && "We know we have at least one!");
+
+ if (CurrentChecks.size() < 3) {
+ RangeChecksOut.insert(RangeChecksOut.end(), CurrentChecks.begin(),
+ CurrentChecks.end());
+ continue;
+ }
+
+ // CurrentChecks.size() will typically be 3 here, but so far there has been
+ // no need to hard-code that fact.
+
+ std::sort(CurrentChecks.begin(), CurrentChecks.end(),
+ [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
+ return LHS.getOffsetValue().slt(RHS.getOffsetValue());
+ });
+
+    // After the sort, CurrentChecks.front()/back() hold the min/max offsets.
+
+ ConstantInt *MinOffset = CurrentChecks.front().getOffset(),
+ *MaxOffset = CurrentChecks.back().getOffset();
+
+ unsigned BitWidth = MaxOffset->getValue().getBitWidth();
+ if ((MaxOffset->getValue() - MinOffset->getValue())
+ .ugt(APInt::getSignedMinValue(BitWidth)))
+ return false;
+
+ APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
+ const APInt &HighOffset = MaxOffset->getValue();
+ auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
+ return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
+ };
+
+ if (MaxDiff.isMinValue() ||
+ !std::all_of(std::next(CurrentChecks.begin()), CurrentChecks.end(),
+ OffsetOK))
+ return false;
+
+ // We have a series of f+1 checks as:
+ //
+ // I+k_0 u< L ... Chk_0
+ // I+k_1 u< L ... Chk_1
+ // ...
+ // I+k_f u< L ... Chk_f
+ //
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
+ // k_f-k_0 u< INT_MIN+k_f ... Precond_1
+ // k_f != k_0 ... Precond_2
+ //
+ // Claim:
+ // Chk_0 AND Chk_f implies all the other checks
+ //
+ // Informal proof sketch:
+ //
+ // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
+ // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
+ // thus I+k_f is the greatest unsigned value in that range.
+ //
+    // This combined with Chk_f shows that everything in that range is u< L.
+    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
+    // lie in [I+k_0,I+k_f], thus proving our claim.
+ //
+ // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
+ // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
+ // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
+ // range by definition, and the latter case is impossible:
+ //
+ // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
+ // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ //
+    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
+    // would have to be at least >u INT_MIN.
+
+ RangeChecksOut.emplace_back(CurrentChecks.front());
+ RangeChecksOut.emplace_back(CurrentChecks.back());
+ }
+
+ assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
+ return RangeChecksOut.size() != OldCount;
+}
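The informal proof above has two halves: the preconditions rule out unsigned wrap of [I+k_0, I+k_f], and once that window does not wrap, the first and last checks imply every check in between. The second half is easy to confirm mechanically; the following standalone brute-force check over 8-bit arithmetic (offsets chosen arbitrarily) exercises it and reports zero violations:

#include <cstdint>
#include <iostream>

int main() {
  const uint8_t k0 = 2, k = 5, kf = 9; // Hypothetical offsets, k0 <= k <= kf.
  unsigned Violations = 0;
  for (unsigned I = 0; I < 256; ++I)
    for (unsigned L = 0; L < 256; ++L) {
      uint8_t Lo = uint8_t(I + k0), Mid = uint8_t(I + k), Hi = uint8_t(I + kf);
      bool NoWrap = Lo <= Hi; // [I+k0, I+kf] does not cross the 255 -> 0 edge.
      // Both end checks hold, the window doesn't wrap, yet the middle fails?
      if (NoWrap && Lo < uint8_t(L) && Hi < uint8_t(L) && !(Mid < uint8_t(L)))
        ++Violations;
    }
  std::cout << "violations: " << Violations << "\n"; // Prints 0.
}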
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+ switch (WS) {
+ case WS_IllegalOrNegative:
+ return "IllegalOrNegative";
+ case WS_Neutral:
+ return "Neutral";
+ case WS_Positive:
+ return "Positive";
+ case WS_VeryPositive:
+ return "VeryPositive";
+ }
+
+ llvm_unreachable("Fully covered switch above!");
+}
+
+char GuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+ return new GuardWideningLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
new file mode 100644
index 000000000000..807593379283
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -0,0 +1,22 @@
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 000000000000..10782963177c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,2543 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened , "Number of indvars widened");
+STATISTIC(NumReplaced , "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars"));
+
+enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl };
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+ "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
+ cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+ cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+ clEnumValN(OnlyCheapRepl, "cheap",
+ "only replace exit value when the cost is cheap"),
+ clEnumValN(AlwaysRepl, "always",
+ "always replace exit value whenever possible")));
+
+static cl::opt<bool> UsePostIncrementRanges(
+ "indvars-post-increment-ranges", cl::Hidden,
+ cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
+ cl::init(true));
+
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
+namespace {
+struct RewritePhi;
+
+class IndVarSimplify {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout &DL;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ bool Changed = false;
+
+ bool isValidRewrite(Value *FromVal, Value *ToVal);
+
+ void handleFloatingPointIV(Loop *L, PHINode *PH);
+ void rewriteNonIntegerIVs(Loop *L);
+
+ void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+
+ bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
+ void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+ void rewriteFirstIterationLoopExitValues(Loop *L);
+
+ Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
+
+ void sinkUnusedInvariants(Loop *L);
+
+ Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+ Instruction *InsertPt, Type *Ty);
+
+public:
+ IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ const DataLayout &DL, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI)
+ : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {}
+
+ bool run(Loop *L);
+};
+}
+
+/// Return true if the SCEV expansion generated by the rewriter can replace the
+/// original value. SCEV guarantees that it produces the same value, but the way
+/// it is produced may be illegal IR. Ideally, this function will only be
+/// called for verification.
+bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
+ // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {
+ FromPtr = GEP->getPointerOperand();
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {
+ ToPtr = GEP->getPointerOperand();
+ }
+ if (FromPtr != FromVal || ToPtr != ToVal) {
+ // Quickly check the common case
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
+ // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true;
+}
+
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+ DominatorTree *DT, LoopInfo *LI) {
+ PHINode *PHI = dyn_cast<PHINode>(User);
+ if (!PHI)
+ return User;
+
+ Instruction *InsertPt = nullptr;
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+ if (PHI->getIncomingValue(i) != Def)
+ continue;
+
+ BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+ if (!InsertPt) {
+ InsertPt = InsertBB->getTerminator();
+ continue;
+ }
+ InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+ InsertPt = InsertBB->getTerminator();
+ }
+ assert(InsertPt && "Missing phi operand");
+
+ auto *DefI = dyn_cast<Instruction>(Def);
+ if (!DefI)
+ return InsertPt;
+
+ assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses");
+
+ auto *L = LI->getLoopFor(DefI->getParent());
+ assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent())));
+
+ for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom())
+ if (LI->getLoopFor(DTN->getBlock()) == L)
+ return DTN->getBlock()->getTerminator();
+
+ llvm_unreachable("DefI dominates InsertPt!");
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+ bool isExact = false;
+ // See if we can convert this to an int64_t
+ uint64_t UIntVal;
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
+ return false;
+ IntVal = UIntVal;
+ return true;
+}
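ConvertToSInt succeeds only when the floating-point value is an exact 64-bit signed integer; anything inexact is rejected so the rewritten integer IV cannot diverge from the original FP IV. For readers without the APFloat API at hand, here is a rough standalone equivalent built on <cmath> (the helper name and its exact edge-case handling are assumptions of this sketch, not the routine above):

#include <cmath>
#include <cstdint>
#include <iostream>

// Returns true and sets IntVal if D is exactly an integer in int64_t range.
static bool convertToSInt(double D, int64_t &IntVal) {
  if (!(D >= -9223372036854775808.0 && D < 9223372036854775808.0))
    return false;              // Out of int64_t range, or NaN.
  if (std::trunc(D) != D)
    return false;              // Not an exact integer (conversion would round).
  IntVal = static_cast<int64_t>(D);
  return true;
}

int main() {
  int64_t V;
  std::cout << convertToSInt(10000.0, V) << " " << V << "\n"; // 1 10000
  std::cout << convertToSInt(0.5, V) << "\n";                 // 0
}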
+
+/// If the loop has a floating-point induction variable, insert a corresponding
+/// integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+///
+void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // Check incoming value.
+ auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+ int64_t InitValue;
+ if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+ return;
+
+  // Check IV increment. Reject this PN if the increment operation is not
+  // an add or the increment value cannot be represented as an integer.
+ auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+ if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return;
+
+ // If this is not an add of the PHI with a constantfp, or if the constant fp
+ // is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+ int64_t IncValue;
+ if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+ !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+ return;
+
+ // Check Incr uses. One user is PN and the other user is an exit condition
+ // used by the conditional terminator.
+ Value::user_iterator IncrUse = Incr->user_begin();
+ Instruction *U1 = cast<Instruction>(*IncrUse++);
+ if (IncrUse == Incr->user_end()) return;
+ Instruction *U2 = cast<Instruction>(*IncrUse++);
+ if (IncrUse != Incr->user_end()) return;
+
+ // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
+ // only used by a branch, we can't transform it.
+ FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+ if (!Compare)
+ Compare = dyn_cast<FCmpInst>(U2);
+ if (!Compare || !Compare->hasOneUse() ||
+ !isa<BranchInst>(Compare->user_back()))
+ return;
+
+ BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+ // We need to verify that the branch actually controls the iteration count
+ // of the loop. If not, the new IV can overflow and no one will notice.
+ // The branch block must be in the loop and one of the successors must be out
+ // of the loop.
+ assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+ if (!L->contains(TheBr->getParent()) ||
+ (L->contains(TheBr->getSuccessor(0)) &&
+ L->contains(TheBr->getSuccessor(1))))
+ return;
+
+
+ // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+ // transform it.
+ ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+ int64_t ExitValue;
+ if (ExitValueVal == nullptr ||
+ !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+ return;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (Compare->getPredicate()) {
+ default: return; // Unknown comparison.
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+ }
+
+ // We convert the floating point induction variable to a signed i32 value if
+ // we can. This is only safe if the comparison will not overflow in a way
+ // that won't be trapped by the integer equivalent operations. Check for this
+ // now.
+ // TODO: We could use i64 if it is native and the range requires it.
+
+ // The start/stride/exit values must all fit in signed i32.
+ if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+ return;
+
+ // If not actually striding (add x, 0.0), avoid touching the code.
+ if (IncValue == 0)
+ return;
+
+ // Positive and negative strides have different safety conditions.
+ if (IncValue > 0) {
+ // If we have a positive stride, we require the init to be less than the
+ // exit value.
+ if (InitValue >= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(ExitValue-InitValue);
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+ return;
+
+ } else {
+ // If we have a negative stride, we require the init to be greater than the
+ // exit value.
+ if (InitValue <= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(InitValue-ExitValue);
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(-IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+ return;
+ }
+
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+ NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+ PN->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd =
+ BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+ ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+ ConstantInt::get(Int32Ty, ExitValue),
+ Compare->getName());
+
+ // In the following deletions, PN may become dead and may be deleted.
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
+
+ // Delete the old floating point exit comparison. The branch starts using the
+ // new comparison.
+ NewCompare->takeName(Compare);
+ Compare->replaceAllUsesWith(NewCompare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+
+ // Delete the old floating point increment.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+
+ // If the FP induction variable still has uses, this is because something else
+ // in the loop uses its value. In order to canonicalize the induction
+ // variable, we chose to eliminate the IV and rewrite it in terms of an
+ // int->fp cast.
+ //
+ // We give preference to sitofp over uitofp because it is faster on most
+ // platforms.
+ if (WeakPH) {
+ Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+ &*PN->getParent()->getFirstInsertionPt());
+ PN->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+ }
+ Changed = true;
+}
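For the equality predicates the stride checks above reduce to one requirement: the integer IV must land exactly on the exit value, or an 'i != Exit' test would step past the bound where the original FP comparison would have stopped. A small standalone sketch of that positive-stride case (the function name and sample numbers are illustrative):

#include <cstdint>
#include <iostream>

// Mirrors the positive-stride, equality-predicate safety check above.
static bool okForEqualityExit(int64_t Init, int64_t Exit, int64_t Inc) {
  if (Inc <= 0 || Init >= Exit)
    return false;
  uint32_t Range = uint32_t(Exit - Init);
  unsigned Leftover = Range % uint32_t(Inc);
  return Leftover == 0; // i != Exit is only safe if the IV hits Exit exactly.
}

int main() {
  std::cout << okForEqualityExit(0, 10000, 1) << "\n"; // 1: lands on 10000
  std::cout << okForEqualityExit(0, 10, 3) << "\n";    // 0: 0,3,6,9,12 skips 10
}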
+
+void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ //
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (BasicBlock::iterator I = Header->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+ handleFloatingPointIV(L, PN);
+
+  // If the loop previously had a floating-point IV, ScalarEvolution
+ // may not have been able to compute a trip count. Now that we've done some
+ // re-writing, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoop(L);
+}
+
+namespace {
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN;
+  unsigned Ith;  // Index of the incoming value being rewritten.
+  Value *Val;    // Exit value after expansion.
+  bool HighCost; // True if expanding the exit value is considered high cost.
+
+ RewritePhi(PHINode *P, unsigned I, Value *V, bool H)
+ : PN(P), Ith(I), Val(V), HighCost(H) {}
+};
+}
+
+Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+ Loop *L, Instruction *InsertPt,
+ Type *ResultTy) {
+ // Before expanding S into an expensive LLVM expression, see if we can use an
+ // already existing value as the expansion for S.
+ if (Value *ExistingValue = Rewriter.getExactExistingExpansion(S, InsertPt, L))
+ if (ExistingValue->getType() == ResultTy)
+ return ExistingValue;
+
+ // We didn't find anything, fall back to using SCEVExpander.
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+/// Check to see if this loop has a computable loop-invariant execution count.
+/// If so, this means that we can compute the final value of any expressions
+/// that are recurrent in the loop, and substitute the exit values from the loop
+/// into any instructions outside of the loop that use the final values of the
+/// current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
+void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ if (!SE->isSCEVable(PN->getType()))
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not an instruction (and hence cannot be
+ // defined inside the loop), skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (!SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+
+ // Computing the value outside of the loop brings no benefit if:
+ // - it is definitely used inside the loop in a way which cannot be
+ // optimized away.
+ // - no use outside of the loop can take advantage of hoisting the
+ // computation out of the loop.
+ if (ExitValue->getSCEVType() >= scMulExpr) {
+ unsigned NumHardInternalUses = 0;
+ unsigned NumSoftExternalUses = 0;
+ unsigned NumUses = 0;
+ for (auto IB = Inst->user_begin(), IE = Inst->user_end();
+ IB != IE && NumUses <= 6; ++IB) {
+ Instruction *UseInstr = cast<Instruction>(*IB);
+ unsigned Opc = UseInstr->getOpcode();
+ NumUses++;
+ if (L->contains(UseInstr)) {
+ if (Opc == Instruction::Call || Opc == Instruction::Ret)
+ NumHardInternalUses++;
+ } else {
+ if (Opc == Instruction::PHI) {
+ // Do not count the Phi as a use. LCSSA may have inserted
+ // plenty of trivial ones.
+ NumUses--;
+ for (auto PB = UseInstr->user_begin(),
+ PE = UseInstr->user_end();
+ PB != PE && NumUses <= 6; ++PB, ++NumUses) {
+ unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+ if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ continue;
+ }
+ if (Opc != Instruction::Call && Opc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ }
+ if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+ continue;
+ }
+
+ bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
+ Value *ExitVal =
+ expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
+
+ DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
+ << " LoopVal = " << *Inst << "\n");
+
+ if (!isValidRewrite(Inst, ExitVal)) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.emplace_back(PN, i, ExitVal, HighCost);
+ }
+ }
+ }
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Val;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ Changed = true;
+ ++NumReplaced;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+}
+
+//===---------------------------------------------------------------------===//
+// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
+// they will exit at the first iteration.
+//===---------------------------------------------------------------------===//
+
+/// Check to see if this loop has loop invariant conditions which lead to loop
+/// exits. If so, we know that if the exit path is taken, it is at the first
+/// loop iteration. This lets us predict exit values of PHI nodes that live in
+/// loop header.
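+///
+/// Illustrative sketch (hypothetical IR): if the loop header contains
+///   %iv = phi i32 [ %init, %preheader ], [ %iv.next, %latch ]
+/// and the header exits the loop on a loop-invariant condition, then an exit
+/// block phi that receives %iv along that edge can receive %init instead,
+/// because that exit edge can only be taken on the first iteration.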
+void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
+ // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm(*DT));
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ auto *LoopHeader = L->getHeader();
+ assert(LoopHeader && "Invalid loop");
+
+ for (auto *ExitBB : ExitBlocks) {
+ BasicBlock::iterator BBI = ExitBB->begin();
+ // If there are no more PHI nodes in this exit block, then no more
+ // values defined inside the loop are used on this path.
+ while (auto *PN = dyn_cast<PHINode>(BBI++)) {
+ for (unsigned IncomingValIdx = 0, E = PN->getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN->getIncomingBlock(IncomingValIdx);
+
+ // We currently only support loop exits from the loop header. If the
+ // incoming block is not the loop header, we would need to recursively
+ // check that all conditions starting from the loop header are loop
+ // invariant. Additional support might be added in the future.
+ if (IncomingBB != LoopHeader)
+ continue;
+
+ // Get condition that leads to the exit path.
+ auto *TermInst = IncomingBB->getTerminator();
+
+ Value *Cond = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ // Must be a conditional branch, otherwise the block
+ // should not be in the loop.
+ Cond = BI->getCondition();
+ } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
+ Cond = SI->getCondition();
+ else
+ continue;
+
+ if (!L->isLoopInvariant(Cond))
+ continue;
+
+ auto *ExitVal =
+ dyn_cast<PHINode>(PN->getIncomingValue(IncomingValIdx));
+
+ // Only deal with PHIs.
+ if (!ExitVal)
+ continue;
+
+ // If ExitVal is a PHI on the loop header, then we know its
+ // value along this exit because the exit can only be taken
+ // on the first iteration.
+ auto *LoopPreheader = L->getLoopPreheader();
+ assert(LoopPreheader && "Invalid loop");
+ int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
+ if (PreheaderIdx != -1) {
+ assert(ExitVal->getParent() == LoopHeader &&
+ "ExitVal must be in loop header");
+ PN->setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
+ }
+ }
+ }
+ }
+}
+
+/// Check whether it is possible to delete the loop after rewriting exit
+/// value. If it is possible, ignore ReplaceExitValue and do rewriting
+/// aggressively.
+bool IndVarSimplify::canLoopBeDeleted(
+ Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ // If there is no preheader, the loop will not be deleted.
+ if (!Preheader)
+ return false;
+
+ // The LoopDeletion pass can delete a loop even when ExitingBlocks.size() > 1;
+ // we avoid the multiple-ExitingBlocks case here for simplicity.
+ // TODO: If we see a testcase where a loop with multiple ExitingBlocks can be
+ // deleted after exit value rewriting, we can enhance the logic here.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1)
+ return false;
+
+ BasicBlock *ExitBlock = ExitBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // If the Incoming value of P is found in RewritePhiSet, we know it
+ // could be rewritten to use a loop invariant value in the transformation
+ // phase later. Skip it in the loop invariant check below.
+ bool found = false;
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ unsigned i = Phi.Ith;
+ if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+ found = true;
+ break;
+ }
+ }
+
+ Instruction *I;
+ if (!found && (I = dyn_cast<Instruction>(Incoming)))
+ if (!L->hasLoopInvariantOperands(I))
+ return false;
+
+ ++BI;
+ }
+
+ for (auto *BB : L->blocks())
+ if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Collect information about induction variables that are used by sign/zero
+// extend operations. This information is recorded by CollectExtend and provides
+// the input to WidenIV.
+struct WideIVInfo {
+ PHINode *NarrowIV = nullptr;
+ Type *WidestNativeType = nullptr; // Widest integer type created by [sz]ext.
+ bool IsSigned = false; // Was a sext user seen before a zext?
+};
+}
+
+/// Update information about the induction variable that is extended by this
+/// sign or zero extend operation. This is used to determine the final width of
+/// the IV before actually widening it.
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+ const TargetTransformInfo *TTI) {
+ bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+ if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+ return;
+
+ Type *Ty = Cast->getType();
+ uint64_t Width = SE->getTypeSizeInBits(Ty);
+ if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+ return;
+
+ // Check that `Cast` actually extends the induction variable (we rely on this
+ // later). This takes care of cases where `Cast` is extending a truncation of
+ // the narrow induction variable, and thus can end up being narrower than the
+ // "narrow" induction variable.
+ uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
+ if (NarrowIVWidth >= Width)
+ return;
+
+ // Cast is either an sext or zext up to this point.
+ // We should not widen an indvar if arithmetic on the wider indvar is more
+ // expensive than that on the narrower indvar. We check only the cost of ADD
+ // because at least an ADD is required to increment the induction variable. We
+ // could compute more comprehensively the cost of all instructions on the
+ // induction variable when necessary.
+ if (TTI &&
+ TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+ TTI->getArithmeticInstrCost(Instruction::Add,
+ Cast->getOperand(0)->getType())) {
+ return;
+ }
+
+ if (!WI.WidestNativeType) {
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+ WI.IsSigned = IsSigned;
+ return;
+ }
+
+ // We extend the IV to satisfy the sign of its first user, arbitrarily.
+ if (WI.IsSigned != IsSigned)
+ return;
+
+ if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
+
+namespace {
+
+/// Record a link in the Narrow IV def-use chain along with the WideIV that
+/// computes the same value as the Narrow IV def. This avoids caching Use*
+/// pointers.
+struct NarrowIVDefUse {
+ Instruction *NarrowDef = nullptr;
+ Instruction *NarrowUse = nullptr;
+ Instruction *WideDef = nullptr;
+
+ // True if the narrow def is never negative. Tracking this information lets
+ // us use a sign extension instead of a zero extension or vice versa, when
+ // profitable and legal.
+ bool NeverNegative = false;
+
+ NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD,
+ bool NeverNegative)
+ : NarrowDef(ND), NarrowUse(NU), WideDef(WD),
+ NeverNegative(NeverNegative) {}
+};
+
+/// The goal of this transform is to remove sign and zero extends without
+/// creating any new induction variables. To do this, it creates a new phi of
+/// the wider type and redirects all users, either removing extends or inserting
+/// truncs whenever we stop propagating the type.
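+///
+/// Illustrative sketch (hypothetical IR): given a narrow IV and an extend
+///   %iv = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
+///   %iv.ext = sext i32 %iv to i64
+/// the transform materializes an i64 phi following the same recurrence and
+/// redirects users of %iv.ext to it, so the sext (and, once it has no other
+/// uses, the narrow phi) can be deleted.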
+///
+class WidenIV {
+ // Parameters
+ PHINode *OrigPhi;
+ Type *WideType;
+
+ // Context
+ LoopInfo *LI;
+ Loop *L;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+
+ // Does the module have any calls to the llvm.experimental.guard intrinsic
+ // at all? If not we can avoid scanning instructions looking for guards.
+ bool HasGuards;
+
+ // Result
+ PHINode *WidePhi;
+ Instruction *WideInc;
+ const SCEV *WideIncExpr;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
+
+ SmallPtrSet<Instruction *,16> Widened;
+ SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
+
+ enum ExtendKind { ZeroExtended, SignExtended, Unknown };
+ // A map tracking the kind of extension used to widen each narrow IV
+ // and narrow IV user.
+ // Key: pointer to a narrow IV or IV user.
+ // Value: the kind of extension used to widen this Instruction.
+ DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap;
+
+ typedef std::pair<AssertingVH<Value>, AssertingVH<Instruction>> DefUserPair;
+ // A map with control-dependent ranges for post increment IV uses. The key is
+ // a pair of IV def and a use of this def denoting the context. The value is
+ // a ConstantRange representing possible values of the def at the given
+ // context.
+ DenseMap<DefUserPair, ConstantRange> PostIncRangeInfos;
+
+ Optional<ConstantRange> getPostIncRangeInfo(Value *Def,
+ Instruction *UseI) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ return It == PostIncRangeInfos.end()
+ ? Optional<ConstantRange>(None)
+ : Optional<ConstantRange>(It->second);
+ }
+
+ void calculatePostIncRanges(PHINode *OrigPhi);
+ void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser);
+ void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) {
+ DefUserPair Key(Def, UseI);
+ auto It = PostIncRangeInfos.find(Key);
+ if (It == PostIncRangeInfos.end())
+ PostIncRangeInfos.insert({Key, R});
+ else
+ It->second = R.intersectWith(It->second);
+ }
+
+public:
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
+ DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI,
+ bool HasGuards)
+ : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
+ L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
+ HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr),
+ WideIncExpr(nullptr), DeadInsts(DI) {
+ assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+ ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
+ }
+
+ PHINode *createWideIV(SCEVExpander &Rewriter);
+
+protected:
+ Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned,
+ Instruction *Use);
+
+ Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR);
+ Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR);
+ Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
+
+ ExtendKind getExtendKind(Instruction *I);
+
+ typedef std::pair<const SCEVAddRecExpr *, ExtendKind> WidenedRecTy;
+
+ WidenedRecTy getWideRecurrence(NarrowIVDefUse DU);
+
+ WidenedRecTy getExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+ const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const;
+
+ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+
+ bool widenLoopCompare(NarrowIVDefUse DU);
+
+ void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// Perform a quick domtree based check for loop invariance assuming that V is
+/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this
+/// purpose.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return true;
+
+ return DT->properlyDominates(Inst->getParent(), L->getHeader());
+}
+
+Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
+ bool IsSigned, Instruction *Use) {
+ // Set the debug location and conservative insertion point.
+ IRBuilder<> Builder(Use);
+ // Hoist the insertion point into loop preheaders as far as possible.
+ for (const Loop *L = LI->getLoopFor(Use->getParent());
+ L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT);
+ L = L->getParentLoop())
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+
+ return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+ Builder.CreateZExt(NarrowOper, WideType);
+}
+
+/// Instantiate a wide operation to replace a narrow operation. This only needs
+/// to handle operations that can evaluate to SCEVAddRec. It can safely return
+/// nullptr for any operation we decide not to clone.
+Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR) {
+ unsigned Opcode = DU.NarrowUse->getOpcode();
+ switch (Opcode) {
+ default:
+ return nullptr;
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::Sub:
+ return cloneArithmeticIVUser(DU, WideAR);
+
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return cloneBitwiseIVUser(DU);
+ }
+}
+
+Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
+ Instruction *NarrowUse = DU.NarrowUse;
+ Instruction *NarrowDef = DU.NarrowDef;
+ Instruction *WideDef = DU.WideDef;
+
+ DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+
+ // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
+ // about the narrow operand yet so must insert a [sz]ext. It is probably loop
+ // invariant and will be folded or hoisted. If it actually comes from a
+ // widened IV, it should be removed during a future call to widenIVUse.
+ bool IsSigned = getExtendKind(NarrowDef) == SignExtended;
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ IsSigned, NarrowUse);
+ Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ IsSigned, NarrowUse);
+
+ auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+ NarrowBO->getName());
+ IRBuilder<> Builder(NarrowUse);
+ Builder.Insert(WideBO);
+ WideBO->copyIRFlags(NarrowBO);
+ return WideBO;
+}
+
+Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR) {
+ Instruction *NarrowUse = DU.NarrowUse;
+ Instruction *NarrowDef = DU.NarrowDef;
+ Instruction *WideDef = DU.WideDef;
+
+ DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+ unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
+
+ // We're trying to find X such that
+ //
+ // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X
+ //
+ // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef),
+ // and check using SCEV if any of them are correct.
+
+ // Returns true if extending NonIVNarrowDef according to `SignExt` is a
+ // correct solution to X.
+ auto GuessNonIVOperand = [&](bool SignExt) {
+ const SCEV *WideLHS;
+ const SCEV *WideRHS;
+
+ auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) {
+ if (SignExt)
+ return SE->getSignExtendExpr(S, Ty);
+ return SE->getZeroExtendExpr(S, Ty);
+ };
+
+ if (IVOpIdx == 0) {
+ WideLHS = SE->getSCEV(WideDef);
+ const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1));
+ WideRHS = GetExtend(NarrowRHS, WideType);
+ } else {
+ const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0));
+ WideLHS = GetExtend(NarrowLHS, WideType);
+ WideRHS = SE->getSCEV(WideDef);
+ }
+
+ // WideUse is "WideDef `op.wide` X" as described in the comment.
+ const SCEV *WideUse = nullptr;
+
+ switch (NarrowUse->getOpcode()) {
+ default:
+ llvm_unreachable("No other possibility!");
+
+ case Instruction::Add:
+ WideUse = SE->getAddExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Mul:
+ WideUse = SE->getMulExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::UDiv:
+ WideUse = SE->getUDivExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Sub:
+ WideUse = SE->getMinusSCEV(WideLHS, WideRHS);
+ break;
+ }
+
+ return WideUse == WideAR;
+ };
+
+ bool SignExtend = getExtendKind(NarrowDef) == SignExtended;
+ if (!GuessNonIVOperand(SignExtend)) {
+ SignExtend = !SignExtend;
+ if (!GuessNonIVOperand(SignExtend))
+ return nullptr;
+ }
+
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ SignExtend, NarrowUse);
+ Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ SignExtend, NarrowUse);
+
+ auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+ NarrowBO->getName());
+
+ IRBuilder<> Builder(NarrowUse);
+ Builder.Insert(WideBO);
+ WideBO->copyIRFlags(NarrowBO);
+ return WideBO;
+}
+
+WidenIV::ExtendKind WidenIV::getExtendKind(Instruction *I) {
+ auto It = ExtendKindMap.find(I);
+ assert(It != ExtendKindMap.end() && "Instruction not yet extended!");
+ return It->second;
+}
+
+const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const {
+ if (OpCode == Instruction::Add)
+ return SE->getAddExpr(LHS, RHS);
+ if (OpCode == Instruction::Sub)
+ return SE->getMinusSCEV(LHS, RHS);
+ if (OpCode == Instruction::Mul)
+ return SE->getMulExpr(LHS, RHS);
+
+ llvm_unreachable("Unsupported opcode.");
+}
+
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return the AddRec and the kind of
+/// extension used.
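+///
+/// Illustrative sketch (hypothetical IR): for a narrow use such as
+///   %t = add nsw i32 %iv, %inv    ; %iv is the narrow def, %inv e.g. invariant
+/// the nsw flag permits sign extending %inv, and if
+///   WideDef + sext(%inv)
+/// is still an add recurrence for this loop, the use can be widened without
+/// inserting a trunc.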
+WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+
+ // Handle the common case of add<nsw/nuw>
+ const unsigned OpCode = DU.NarrowUse->getOpcode();
+ // Only Add/Sub/Mul instructions supported yet.
+ if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+ OpCode != Instruction::Mul)
+ return {nullptr, Unknown};
+
+ // One operand (NarrowDef) has already been extended to WideDef. Now determine
+ // if extending the other will lead to a recurrence.
+ const unsigned ExtendOperIdx =
+ DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+ assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+ const SCEV *ExtendOperExpr = nullptr;
+ const OverflowingBinaryOperator *OBO =
+ cast<OverflowingBinaryOperator>(DU.NarrowUse);
+ ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
+ if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
+ ExtendOperExpr = SE->getSignExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
+ ExtendOperExpr = SE->getZeroExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else
+ return {nullptr, Unknown};
+
+ // When creating this SCEV expr, don't apply the current operation's NSW or NUW
+ // flags. This instruction may be guarded by control flow that the no-wrap
+ // behavior depends on. Non-control-equivalent instructions can be mapped to
+ // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
+ // semantics to those operations.
+ const SCEV *lhs = SE->getSCEV(DU.WideDef);
+ const SCEV *rhs = ExtendOperExpr;
+
+ // Let's swap operands to the initial order for the case of non-commutative
+ // operations, like SUB. See PR21014.
+ if (ExtendOperIdx == 0)
+ std::swap(lhs, rhs);
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+
+ if (!AddRec || AddRec->getLoop() != L)
+ return {nullptr, Unknown};
+
+ return {AddRec, ExtKind};
+}
+
+/// Is this instruction potentially interesting for further simplification after
+/// widening its type? In other words, can the extend be safely hoisted out of
+/// the loop with SCEV reducing the value to a recurrence on the same loop? If
+/// so, return the extended recurrence and the kind of extension used. Otherwise
+/// return {nullptr, Unknown}.
+WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) {
+ if (!SE->isSCEVable(DU.NarrowUse->getType()))
+ return {nullptr, Unknown};
+
+ const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
+ if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
+ SE->getTypeSizeInBits(WideType)) {
+ // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
+ // index. So don't follow this use.
+ return {nullptr, Unknown};
+ }
+
+ const SCEV *WideExpr;
+ ExtendKind ExtKind;
+ if (DU.NeverNegative) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ if (isa<SCEVAddRecExpr>(WideExpr))
+ ExtKind = SignExtended;
+ else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
+ } else if (getExtendKind(DU.NarrowDef) == SignExtended) {
+ WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+ ExtKind = SignExtended;
+ } else {
+ WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+ ExtKind = ZeroExtended;
+ }
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+ if (!AddRec || AddRec->getLoop() != L)
+ return {nullptr, Unknown};
+ return {AddRec, ExtKind};
+}
+
+/// This IV user cannot be widened. Replace this use of the original narrow IV
+/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
+static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
+ DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
+ << " for user " << *DU.NarrowUse << "\n");
+ IRBuilder<> Builder(
+ getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+ Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+}
+
+/// If the narrow use is a compare instruction, then widen the compare
+/// (and possibly the other operand). The extend operation is hoisted into the
+/// loop preheader as far as possible.
+bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+ if (!Cmp)
+ return false;
+
+ // We can legally widen the comparison in the following two cases:
+ //
+ // - The signedness of the IV extension and comparison match
+ //
+ // - The narrow IV is always positive (and thus its sign extension is equal
+ // to its zero extension). For instance, let's say we're zero extending
+ // %narrow for the following use
+ //
+ // icmp slt i32 %narrow, %val ... (A)
+ //
+ // and %narrow is always positive. Then
+ //
+ // (A) == icmp slt i32 sext(%narrow), sext(%val)
+ // == icmp slt i32 zext(%narrow), sext(%val)
+ bool IsSigned = getExtendKind(DU.NarrowDef) == SignExtended;
+ if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
+ return false;
+
+ Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+ unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+ // Widen the compare instruction.
+ IRBuilder<> Builder(
+ getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+ // Widen the other operand of the compare, if necessary.
+ if (CastWidth < IVWidth) {
+ Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp);
+ DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+ }
+ return true;
+}
+
+/// Determine whether an individual user of the narrow IV can be widened. If so,
+/// return the wide clone of the user.
+Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+ assert(ExtendKindMap.count(DU.NarrowDef) &&
+ "Should already know the kind of extension used to widen NarrowDef");
+
+ // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+ if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
+ if (LI->getLoopFor(UsePhi->getParent()) != L) {
+ // For LCSSA phis, sink the truncate outside the loop.
+ // After SimplifyCFG most loop exit targets have a single predecessor.
+ // Otherwise fall back to a truncate within the loop.
+ if (UsePhi->getNumOperands() != 1)
+ truncateIVUse(DU, DT, LI);
+ else {
+ // Widening the PHI requires us to insert a trunc. The logical place
+ // for this trunc is in the same BB as the PHI. This is not possible if
+ // the BB is terminated by a catchswitch.
+ if (isa<CatchSwitchInst>(UsePhi->getParent()->getTerminator()))
+ return nullptr;
+
+ PHINode *WidePhi =
+ PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
+ UsePhi);
+ WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
+ IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());
+ Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
+ UsePhi->replaceAllUsesWith(Trunc);
+ DeadInsts.emplace_back(UsePhi);
+ DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
+ << " to " << *WidePhi << "\n");
+ }
+ return nullptr;
+ }
+ }
+
+ // This narrow use can be widened by a sext if it's non-negative or its narrow
+ // def was widened by a sext. Same for zext.
+ auto canWidenBySExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == SignExtended;
+ };
+ auto canWidenByZExt = [&]() {
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ZeroExtended;
+ };
+
+ // Our raison d'etre! Eliminate sign and zero extension.
+ if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) ||
+ (isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
+ Value *NewDef = DU.WideDef;
+ if (DU.NarrowUse->getType() != WideType) {
+ unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ if (CastWidth < IVWidth) {
+ // The cast isn't as wide as the IV, so insert a Trunc.
+ IRBuilder<> Builder(DU.NarrowUse);
+ NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+ }
+ else {
+ // A wider extend was hidden behind a narrower one. This may induce
+ // another round of IV widening in which the intermediate IV becomes
+ // dead. It should be very rare.
+ DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+ NewDef = DU.NarrowUse;
+ }
+ }
+ if (NewDef != DU.NarrowUse) {
+ DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
+ ++NumElimExt;
+ DU.NarrowUse->replaceAllUsesWith(NewDef);
+ DeadInsts.emplace_back(DU.NarrowUse);
+ }
+ // Now that the extend is gone, we want to expose its uses for potential
+ // further simplification. We don't need to directly inform SimplifyIVUsers
+ // of the new users, because their parent IV will be processed later as a
+ // new loop phi. If we preserved IVUsers analysis, we would also want to
+ // push the uses of WideDef here.
+
+ // No further widening is needed. The deceased [sz]ext had done it for us.
+ return nullptr;
+ }
+
+ // Does this user itself evaluate to a recurrence after widening?
+ WidenedRecTy WideAddRec = getExtendedOperandRecurrence(DU);
+ if (!WideAddRec.first)
+ WideAddRec = getWideRecurrence(DU);
+
+ assert((WideAddRec.first == nullptr) == (WideAddRec.second == Unknown));
+ if (!WideAddRec.first) {
+ // If use is a loop condition, try to promote the condition instead of
+ // truncating the IV first.
+ if (widenLoopCompare(DU))
+ return nullptr;
+
+ // This user does not evaluate to a recurrence after widening, so don't
+ // follow it. Instead insert a Trunc to kill off the original use,
+ // eventually isolating the original narrow IV so it can be removed.
+ truncateIVUse(DU, DT, LI);
+ return nullptr;
+ }
+ // Assume block terminators cannot evaluate to a recurrence. We can't insert
+ // a Trunc after a terminator if there happens to be a critical edge.
+ assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+ "SCEV is not expected to evaluate a block terminator");
+
+ // Reuse the IV increment that SCEVExpander created as long as it dominates
+ // NarrowUse.
+ Instruction *WideUse = nullptr;
+ if (WideAddRec.first == WideIncExpr &&
+ Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+ WideUse = WideInc;
+ else {
+ WideUse = cloneIVUser(DU, WideAddRec.first);
+ if (!WideUse)
+ return nullptr;
+ }
+ // Evaluation of WideAddRec ensured that the narrow expression could be
+ // extended outside the loop without overflow. This suggests that the wide use
+ // evaluates to the same expression as the extended narrow use, but doesn't
+ // absolutely guarantee it. Hence the following failsafe check. In rare cases
+ // where it fails, we simply throw away the newly created wide use.
+ if (WideAddRec.first != SE->getSCEV(WideUse)) {
+ DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+ << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n");
+ DeadInsts.emplace_back(WideUse);
+ return nullptr;
+ }
+
+ ExtendKindMap[DU.NarrowUse] = WideAddRec.second;
+ // Returning WideUse pushes it on the worklist.
+ return WideUse;
+}
+
+/// Add eligible users of NarrowDef to NarrowIVUsers.
+///
+void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
+ const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
+ bool NonNegativeDef =
+ SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV,
+ SE->getConstant(NarrowSCEV->getType(), 0));
+ for (User *U : NarrowDef->users()) {
+ Instruction *NarrowUser = cast<Instruction>(U);
+
+ // Handle data flow merges and bizarre phi cycles.
+ if (!Widened.insert(NarrowUser).second)
+ continue;
+
+ bool NonNegativeUse = false;
+ if (!NonNegativeDef) {
+ // We might have a control-dependent range information for this context.
+ if (auto RangeInfo = getPostIncRangeInfo(NarrowDef, NarrowUser))
+ NonNegativeUse = RangeInfo->getSignedMin().isNonNegative();
+ }
+
+ NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef,
+ NonNegativeDef || NonNegativeUse);
+ }
+}
+
+/// Process a single induction variable. First use the SCEVExpander to create a
+/// wide induction variable that evaluates to the same recurrence as the
+/// original narrow IV. Then use a worklist to forward traverse the narrow IV's
+/// def-use chain. After widenIVUse has processed all interesting IV users, the
+/// narrow IV will be isolated for removal by DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+///
+PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
+ // Is this phi an induction variable?
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+ if (!AddRec)
+ return nullptr;
+
+ // Widen the induction variable expression.
+ const SCEV *WideIVExpr = getExtendKind(OrigPhi) == SignExtended
+ ? SE->getSignExtendExpr(AddRec, WideType)
+ : SE->getZeroExtendExpr(AddRec, WideType);
+
+ assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+ "Expect the new IV expression to preserve its type");
+
+ // Can the IV be extended outside the loop without overflow?
+ AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+ if (!AddRec || AddRec->getLoop() != L)
+ return nullptr;
+
+ // An AddRec must have loop-invariant operands. Since this AddRec is
+ // materialized by a loop header phi, the expression cannot have any post-loop
+ // operands, so they must dominate the loop header.
+ assert(
+ SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+ SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) &&
+ "Loop header phi recurrence inputs do not dominate the loop");
+
+ // Iterate over IV uses (including transitive ones) looking for IV increments
+ // of the form 'add nsw %iv, <const>'. For each increment and each use of
+ // the increment calculate control-dependent range information based on
+ // dominating conditions inside of the loop (e.g. a range check inside of the
+ // loop). Calculated ranges are stored in PostIncRangeInfos map.
+ //
+ // Control-dependent range information is later used to prove that a narrow
+ // definition is not negative (see pushNarrowIVUsers). It's difficult to do
+ // this on demand because when pushNarrowIVUsers needs this information some
+ // of the dominating conditions might be already widened.
+ if (UsePostIncrementRanges)
+ calculatePostIncRanges(OrigPhi);
+
+ // The rewriter provides a value for the desired IV expression. This may
+ // either find an existing phi or materialize a new one. Either way, we
+ // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+ // of the phi-SCC dominates the loop entry.
+ Instruction *InsertPt = &L->getHeader()->front();
+ WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+ // Remembering the WideIV increment generated by SCEVExpander allows
+ // widenIVUse to reuse it when widening the narrow IV's increment. We don't
+ // employ a general reuse mechanism because the call above is the only call to
+ // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+ WideInc =
+ cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+ WideIncExpr = SE->getSCEV(WideInc);
+ // Propagate the debug location associated with the original loop increment
+ // to the new (widened) increment.
+ auto *OrigInc =
+ cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
+ WideInc->setDebugLoc(OrigInc->getDebugLoc());
+ }
+
+ DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ ++NumWidened;
+
+ // Traverse the def-use chain using a worklist starting at the original IV.
+ assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
+
+ Widened.insert(OrigPhi);
+ pushNarrowIVUsers(OrigPhi, WidePhi);
+
+ while (!NarrowIVUsers.empty()) {
+ NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
+
+ // Process a def-use edge. This may replace the use, so don't hold a
+ // use_iterator across it.
+ Instruction *WideUse = widenIVUse(DU, Rewriter);
+
+ // Follow all def-use edges from the previous narrow use.
+ if (WideUse)
+ pushNarrowIVUsers(DU.NarrowUse, WideUse);
+
+ // widenIVUse may have removed the def-use edge.
+ if (DU.NarrowDef->use_empty())
+ DeadInsts.emplace_back(DU.NarrowDef);
+ }
+ return WidePhi;
+}
+
+/// Calculates control-dependent range for the given def at the given context
+/// by looking at dominating conditions inside of the loop
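+///
+/// Illustrative sketch (hypothetical IR): for %iv.next = add nsw i32 %iv, 1,
+/// if a dominating check such as 'icmp slt i32 %iv, %len' is known to hold at
+/// a given use, the constrained range of %iv plus 1 (no signed wrap) gives a
+/// range for %iv.next at that use, which is recorded in PostIncRangeInfos.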
+void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
+ Instruction *NarrowUser) {
+ using namespace llvm::PatternMatch;
+
+ Value *NarrowDefLHS;
+ const APInt *NarrowDefRHS;
+ if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS),
+ m_APInt(NarrowDefRHS))) ||
+ !NarrowDefRHS->isNonNegative())
+ return;
+
+ auto UpdateRangeFromCondition = [&] (Value *Condition,
+ bool TrueDest) {
+ CmpInst::Predicate Pred;
+ Value *CmpRHS;
+ if (!match(Condition, m_ICmp(Pred, m_Specific(NarrowDefLHS),
+ m_Value(CmpRHS))))
+ return;
+
+ CmpInst::Predicate P =
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+
+ auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
+ auto CmpConstrainedLHSRange =
+ ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange);
+ auto NarrowDefRange =
+ CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS);
+
+ updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange);
+ };
+
+ auto UpdateRangeFromGuards = [&](Instruction *Ctx) {
+ if (!HasGuards)
+ return;
+
+ for (Instruction &I : make_range(Ctx->getIterator().getReverse(),
+ Ctx->getParent()->rend())) {
+ Value *C = nullptr;
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(C))))
+ UpdateRangeFromCondition(C, /*TrueDest=*/true);
+ }
+ };
+
+ UpdateRangeFromGuards(NarrowUser);
+
+ BasicBlock *NarrowUserBB = NarrowUser->getParent();
+ // If NarrowUserBB is statically unreachable, asking dominator queries may
+ // yield surprising results (e.g. the block may not have a dom tree node).
+ if (!DT->isReachableFromEntry(NarrowUserBB))
+ return;
+
+ for (auto *DTB = (*DT)[NarrowUserBB]->getIDom();
+ L->contains(DTB->getBlock());
+ DTB = DTB->getIDom()) {
+ auto *BB = DTB->getBlock();
+ auto *TI = BB->getTerminator();
+ UpdateRangeFromGuards(TI);
+
+ auto *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ continue;
+
+ auto *TrueSuccessor = BI->getSuccessor(0);
+ auto *FalseSuccessor = BI->getSuccessor(1);
+
+ auto DominatesNarrowUser = [this, NarrowUser] (BasicBlockEdge BBE) {
+ return BBE.isSingleEdge() &&
+ DT->dominates(BBE, NarrowUser->getParent());
+ };
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, TrueSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/true);
+
+ if (DominatesNarrowUser(BasicBlockEdge(BB, FalseSuccessor)))
+ UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/false);
+ }
+}
+
+/// Calculates PostIncRangeInfos map for the given IV
+void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) {
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 6> Worklist;
+ Worklist.push_back(OrigPhi);
+ Visited.insert(OrigPhi);
+
+ while (!Worklist.empty()) {
+ Instruction *NarrowDef = Worklist.pop_back_val();
+
+ for (Use &U : NarrowDef->uses()) {
+ auto *NarrowUser = cast<Instruction>(U.getUser());
+
+ // Don't go looking outside the current loop.
+ auto *NarrowUserLoop = (*LI)[NarrowUser->getParent()];
+ if (!NarrowUserLoop || !L->contains(NarrowUserLoop))
+ continue;
+
+ if (!Visited.insert(NarrowUser).second)
+ continue;
+
+ Worklist.push_back(NarrowUser);
+
+ calculatePostIncRange(NarrowDef, NarrowUser);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class IndVarSimplifyVisitor : public IVVisitor {
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ PHINode *IVPhi;
+
+public:
+ WideIVInfo WI;
+
+ IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+ const TargetTransformInfo *TTI,
+ const DominatorTree *DTree)
+ : SE(SCEV), TTI(TTI), IVPhi(IV) {
+ DT = DTree;
+ WI.NarrowIV = IVPhi;
+ }
+
+ // Implement the interface used by simplifyUsersOfIV.
+ void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+}
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+///
+void IndVarSimplify::simplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LoopInfo *LI) {
+ SmallVector<WideIVInfo, 8> WideIVs;
+
+ auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasGuards = GuardDecl && !GuardDecl->use_empty();
+
+ SmallVector<PHINode*, 8> LoopPhis;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ LoopPhis.push_back(cast<PHINode>(I));
+ }
+ // Each round of simplification iterates through the SimplifyIVUsers worklist
+ // for all current phis, then determines whether any IVs can be
+ // widened. Widening adds new phis to LoopPhis, inducing another round of
+ // simplification on the wide IVs.
+ while (!LoopPhis.empty()) {
+ // Evaluate as many IV expressions as possible before widening any IVs. This
+ // forces SCEV to set no-wrap flags before evaluating sign/zero
+ // extension. The first time SCEV attempts to normalize sign/zero extension,
+ // the result becomes final. So for the most predictable results, we delay
+ // evaluation of sign/zero extension until needed, and avoid running
+ // other SCEV based analysis prior to simplifyAndExtend.
+ do {
+ PHINode *CurrIV = LoopPhis.pop_back_val();
+
+ // Information about sign/zero extensions of CurrIV.
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+ Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor);
+
+ if (Visitor.WI.WidestNativeType) {
+ WideIVs.push_back(Visitor.WI);
+ }
+ } while (!LoopPhis.empty());
+
+ for (; !WideIVs.empty(); WideIVs.pop_back()) {
+ WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts, HasGuards);
+ if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
+ Changed = true;
+ LoopPhis.push_back(WidePhi);
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Return true if this loop's backedge taken count expression can be safely and
+/// cheaply expanded into an instruction sequence that can be used by
+/// linearFunctionTestReplace.
+///
+/// TODO: This fails for pointer-type loop counters with greater than one byte
+/// strides, consequently preventing LFTR from running. For the purpose of LFTR
+/// we could skip this check in the case that the LFTR loop counter (chosen by
+/// FindLoopCounter) is also pointer type. Instead, we could directly convert
+/// the loop test to an inequality test by checking the target data's alignment
+/// of element types (given that the initial pointer value originates from or is
+/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint).
+/// However, we don't yet have a strong motivation for converting loop tests
+/// into inequality tests.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE,
+ SCEVExpander &Rewriter) {
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+ BackedgeTakenCount->isZero())
+ return false;
+
+ if (!L->getExitingBlock())
+ return false;
+
+ // Can't rewrite non-branch yet.
+ if (!isa<BranchInst>(L->getExitingBlock()->getTerminator()))
+ return false;
+
+ if (Rewriter.isHighCostExpansion(BackedgeTakenCount, L))
+ return false;
+
+ return true;
+}
+
+/// Return the loop header phi IFF IncV adds a loop invariant value to the phi.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return nullptr;
+
+ switch (IncI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ break;
+ case Instruction::GetElementPtr:
+ // An IV counter must preserve its type.
+ if (IncI->getNumOperands() == 2)
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return nullptr;
+ }
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(1), L, DT))
+ return Phi;
+ return nullptr;
+ }
+ if (IncI->getOpcode() == Instruction::GetElementPtr)
+ return nullptr;
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(0), L, DT))
+ return Phi;
+ }
+ return nullptr;
+}
+
+/// Return the compare guarding the loop latch, or NULL for unrecognized tests.
+static ICmpInst *getLoopTest(Loop *L) {
+ assert(L->getExitingBlock() && "expected loop exit");
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ // Don't bother with LFTR if the loop is not properly simplified.
+ if (!LatchBlock)
+ return nullptr;
+
+ BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ assert(BI && "expected exit branch");
+
+ return dyn_cast<ICmpInst>(BI->getCondition());
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+ // Do LFTR to simplify the exit condition to an ICMP.
+ ICmpInst *Cond = getLoopTest(L);
+ if (!Cond)
+ return true;
+
+ // Do LFTR to simplify the exit ICMP to EQ/NE
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+ return true;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!isLoopInvariant(RHS, L, DT)) {
+ if (!isLoopInvariant(LHS, L, DT))
+ return true;
+ std::swap(LHS, RHS);
+ }
+ // Look for a simple IV counter LHS
+ PHINode *Phi = dyn_cast<PHINode>(LHS);
+ if (!Phi)
+ Phi = getLoopPhiForCounter(LHS, L, DT);
+
+ if (!Phi)
+ return true;
+
+ // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ if (Idx < 0)
+ return true;
+
+ // Do LFTR if the exit condition's IV is *not* a simple counter.
+ Value *IncV = Phi->getIncomingValue(Idx);
+ return Phi != getLoopPhiForCounter(IncV, L, DT);
+}
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+ unsigned Depth) {
+ if (isa<Constant>(V))
+ return !isa<UndefValue>(V);
+
+ if (Depth >= 6)
+ return false;
+
+ // Conservatively handle non-constant non-instructions. For example, Arguments
+ // may be undef.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // Load and return values may be undef.
+ if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+ return false;
+
+ // Optimistically handle other instructions.
+ for (Value *Op : I->operands()) {
+ if (!Visited.insert(Op).second)
+ continue;
+ if (!hasConcreteDefImpl(Op, Visited, Depth+1))
+ return false;
+ }
+ return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+ SmallPtrSet<Value*, 8> Visited;
+ Visited.insert(V);
+ return hasConcreteDefImpl(V, Visited, 0);
+}
+
+/// Return true if this IV has any uses other than the (soon to be rewritten)
+/// loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+ for (User *U : Phi->users())
+ if (U != Cond && U != IncV) return false;
+
+ for (User *U : IncV->users())
+ if (U != Cond && U != Phi) return false;
+ return true;
+}
+
+/// Find an affine IV in canonical form.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already a
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef above.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
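+///
+/// Illustrative sketch (hypothetical IR): the kind of counter this looks for is
+///   %i = phi i32 [ 0, %preheader ], [ %i.next, %latch ]
+///   %i.next = add nsw i32 %i, 1
+/// i.e. an affine IV with a step of one whose latch increment feeds back into
+/// the same phi.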
+static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ Value *Cond =
+ cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = nullptr;
+ const SCEV *BestInit = nullptr;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!SE->isSCEVable(Phi->getType()))
+ continue;
+
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ continue;
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
+ continue;
+
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ continue;
+
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+ continue;
+
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop test. In this case we assume that performing LFTR could not
+ // increase the number of undef users.
+ if (ICmpInst *Cond = getLoopTest(L)) {
+ if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) &&
+ Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+ continue;
+ }
+ }
+ }
+ const SCEV *Init = AR->getStart();
+
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+ // Prefer to count-from-zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
+
+/// Help linearFunctionTestReplace by generating a value that holds the RHS of
+/// the new loop test.
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Zero extend (or truncate) IVCount to a pointer
+ // sized integer in order to materialize a GEP. Avoid running SCEVExpander on
+ // a new pointer value, instead reusing the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) {
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. IVCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend IVCount.
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+ Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+ assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling. See canExpandBackedgeTakenCount comments.
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+ cast<PointerType>(GEPBase->getType())
+ ->getElementType())->isOne() &&
+ "unit stride pointer IV must be i8*");
+
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
+ } else {
+ // In any other case, convert both IVInit and IVCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as such:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ //
+ // Valid Cases: (1) both are integers, which is the most common; (2) both may
+ // be pointers for simple memset-style loops.
+ //
+ // IVInit integer and IVCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
+
+ const SCEV *IVLimit = nullptr;
+ // For unit stride, IVCount = Start + BECount with 2's complement overflow.
+ // For non-zero Start, compute IVCount here.
+ if (AR->getStart()->isZero())
+ IVLimit = IVCount;
+ else {
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ const SCEV *IVInit = AR->getStart();
+
+ // For integer IVs, truncate the IV before computing IVInit + BECount.
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(IVCount->getType()))
+ IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+
+ IVLimit = SE->getAddExpr(IVInit, IVCount);
+ }
+ // Expand the code for the iteration count.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ IRBuilder<> Builder(BI);
+ assert(SE->isLoopInvariant(IVLimit, L) &&
+ "Computed iteration count is not loop invariant!");
+ // Ensure that we generate the same type as IndVar, or a smaller integer
+ // type. In the presence of null pointer values, we have an integer type
+ // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+ Type *LimitTy = IVCount->getType()->isPointerTy() ?
+ IndVar->getType() : IVCount->getType();
+ return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+ }
+}
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable. This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
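+///
+/// Illustrative example (editor's note, not part of the upstream comment): a
+/// loop that exits from its latch on `i < n` and has a computable trip count
+/// is rewritten to exit on `i.next != limit`, where `limit` is the value
+/// produced by genLoopLimit above.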
+Value *IndVarSimplify::
+linearFunctionTestReplace(Loop *L,
+ const SCEV *BackedgeTakenCount,
+ PHINode *IndVar,
+ SCEVExpander &Rewriter) {
+ assert(canExpandBackedgeTakenCount(L, SE, Rewriter) && "precondition");
+
+ // Initialize CmpIndVar and IVCount to their preincremented values.
+ Value *CmpIndVar = IndVar;
+ const SCEV *IVCount = BackedgeTakenCount;
+
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
+ // If the exiting block is the same as the backedge block, we prefer to
+ // compare against the post-incremented value, otherwise we must compare
+ // against the preincremented value.
+ if (L->getExitingBlock() == L->getLoopLatch()) {
+ // Add one to the "backedge-taken" count to get the trip count.
+ // This addition may overflow, which is valid as long as the comparison is
+ // truncated to BackedgeTakenCount->getType().
+ IVCount = SE->getAddExpr(BackedgeTakenCount,
+ SE->getOne(BackedgeTakenCount->getType()));
+ // The BackedgeTaken expression contains the number of times that the
+ // backedge branches to the loop header. This is one less than the
+ // number of times the loop executes, so use the incremented indvar.
+ CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+ }
+
+ Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
+ assert(ExitCnt->getType()->isPointerTy() ==
+ IndVar->getType()->isPointerTy() &&
+ "genLoopLimit missed a cast");
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ ICmpInst::Predicate P;
+ if (L->contains(BI->getSuccessor(0)))
+ P = ICmpInst::ICMP_NE;
+ else
+ P = ICmpInst::ICMP_EQ;
+
+ DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t"
+ << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
+
+ IRBuilder<> Builder(BI);
+
+ // The new loop exit condition should reuse the debug location of the
+ // original loop exit condition.
+ if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
+ Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
+
+ // LFTR can ignore IV overflow and truncate to the width of
+ // BECount. This avoids materializing the add(zext(add)) expression.
+ unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+ unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+ if (CmpIndVarSize > ExitCntSize) {
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ const SCEV *ARStart = AR->getStart();
+ const SCEV *ARStep = AR->getStepRecurrence(*SE);
+ // For constant IVCount, avoid truncation.
+ if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) {
+ const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt();
+ APInt Count = cast<SCEVConstant>(IVCount)->getAPInt();
+ // Note that the post-inc value of BackedgeTakenCount may have overflowed
+ // above such that IVCount is now zero.
+ if (IVCount != BackedgeTakenCount && Count == 0) {
+ Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize);
+ ++Count;
+ } else
+ Count = Count.zext(CmpIndVarSize);
+ APInt NewLimit;
+ if (cast<SCEVConstant>(ARStep)->getValue()->isNegative())
+ NewLimit = Start - Count;
+ else
+ NewLimit = Start + Count;
+ ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
+
+ DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ } else {
+ // We try to extend the trip count first; if that doesn't work, we truncate
+ // the IV. Zext(trunc(IV)) == IV implies the equivalence of the following
+ // two: Trunc(IV) == ExitCnt and IV == zext(ExitCnt). Similarly for sext. If
+ // one of the two holds, extend the trip count; otherwise truncate the IV.
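+ // For example (editor's note): with an i64 IV whose value provably fits in
+ // the narrower exit-count type, zext(trunc(IV)) == IV holds, so the exit
+ // count is zero-extended to the IV's type instead of truncating the IV.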
+ bool Extended = false;
+ const SCEV *IV = SE->getSCEV(CmpIndVar);
+ const SCEV *ZExtTrunc =
+ SE->getZeroExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+
+ if (ZExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ } else {
+ const SCEV *SExtTrunc =
+ SE->getSignExtendExpr(SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType()),
+ CmpIndVar->getType());
+ if (SExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ }
+ }
+
+ if (!Extended)
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
+ }
+ }
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+ Value *OrigCond = BI->getCondition();
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+ // will make old comparison dead.
+ BI->setCondition(Cond);
+ DeadInsts.push_back(OrigCond);
+
+ ++NumLFTR;
+ Changed = true;
+ return Cond;
+}
+
+//===----------------------------------------------------------------------===//
+// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
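+///
+/// For example (editor's note): a loop-invariant add computed in the
+/// preheader whose only uses are after the loop is moved into the single
+/// exit block.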
+void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return;
+
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator I(Preheader->getTerminator());
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Also don't
+ // move instructions which might read memory, since the loop may modify
+ // memory. Note that it's okay if the instruction might have undefined
+ // behavior: LoopSimplify guarantees that the preheader dominates the exit
+ // block.
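+ // For instance (editor's note), a load in the preheader must not be sunk
+ // past a loop body that may store to the same location.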
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
+
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip eh pad instructions.
+ if (I->isEHPad())
+ continue;
+
+ // Don't sink allocas: we never want to sink static allocas out of the
+ // entry block, and correctly sinking dynamic allocas requires
+ // checks for stacksave/stackrestore intrinsics.
+ // FIXME: Refactor this check somehow?
+ if (isa<AllocaInst>(I))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = User->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(User)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = &*I;
+ bool Done = false;
+
+ if (I != Preheader->begin()) {
+ // Skip debug info intrinsics.
+ do {
+ --I;
+ } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+ if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ Done = true;
+ } else {
+ Done = true;
+ }
+
+ ToMove->moveBefore(*ExitBlock, InsertPt);
+ if (Done) break;
+ InsertPt = ToMove->getIterator();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::run(Loop *L) {
+ // We need (and expect!) the incoming loop to be in LCSSA.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA required to run indvars!");
+
+ // If LoopSimplify form is not available, stay out of trouble. Some notes:
+ // - LSR currently only supports LoopSimplify-form loops. Indvars'
+ // canonicalization can be a pessimization without LSR to "clean up"
+ // afterwards.
+ // - We depend on having a preheader; in particular,
+ // Loop::getCanonicalInductionVariable only supports loops with preheaders,
+ // and we're in trouble if we can't find the induction variable even when
+ // we've manually inserted one.
+ // - LFTR relies on having a single backedge.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ rewriteNonIntegerIVs(L);
+
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+ // Eliminate redundant IV users.
+ //
+ // Simplification works best when run before other consumers of SCEV. We
+ // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+ // other expressions involving loop IVs have been evaluated. This helps SCEV
+ // set no-wrap flags before normalizing sign/zero extension.
+ Rewriter.disableCanonicalMode();
+ simplifyAndExtend(L, Rewriter, LI);
+
+ // Check to see if this loop has a computable loop-invariant execution count.
+ // If so, this means that we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values of
+ // the current expressions.
+ //
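+ // For example (editor's note): if a recurrence adds 2 on each of the loop's
+ // 10 iterations, a use of its final value after the loop can be rewritten
+ // to the start value plus 20.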
+ if (ReplaceExitValue != NeverRepl &&
+ !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ rewriteLoopExitValues(L, Rewriter);
+
+ // Eliminate redundant IV cycles.
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it. We can currently only handle loops with a single exit.
+ if (!DisableLFTR && canExpandBackedgeTakenCount(L, SE, Rewriter) &&
+ needsLFTR(L, DT)) {
+ PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
+ if (IndVar) {
+ // Check preconditions for proper SCEVExpander operation. SCEV does not
+ // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
+ // pass that uses the SCEVExpander must do it. This does not work well for
+ // loop passes because SCEVExpander makes assumptions about all loops,
+ // while LoopPassManager only forces the current loop to be simplified.
+ //
+ // FIXME: SCEV expansion has no way to bail out, so the caller must
+ // explicitly check any assumptions made by SCEV. Brittle.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);
+ if (!AR || AR->getLoop()->getLoopPreheader())
+ (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+ Rewriter);
+ }
+ }
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+
+ // The Rewriter may not be used from this point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ sinkUnusedInvariants(L);
+
+ // rewriteFirstIterationLoopExitValues does not rely on the computation of
+ // trip count and therefore can further simplify exit values in addition to
+ // rewriteLoopExitValues.
+ rewriteFirstIterationLoopExitValues(L);
+
+ // Clean up dead instructions.
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+
+ // Check a post-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ // Verify that LFTR and any other changes have not interfered with SCEV's
+ // ability to compute the trip count.
+#ifndef NDEBUG
+ if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+ }
+#endif
+
+ return Changed;
+}
+
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Function *F = L.getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI);
+ if (!IVS.run(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct IndVarSimplifyLegacyPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ IndVarSimplifyLegacyPass() : LoopPass(ID) {
+ initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ return IVS.run(L);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char IndVarSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplifyLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 000000000000..a40c22c3fce9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1607 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does so in a way such that the middle loop
+// provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+ cl::Hidden, cl::init(10));
+
+static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+static const char *ClonedLoopTag = "irce.loop.clone";
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+ // Classifies a range check
+ enum RangeCheckKind : unsigned {
+ // Range check of the form "0 <= I".
+ RANGE_CHECK_LOWER = 1,
+
+ // Range check of the form "I < L" where L is known positive.
+ RANGE_CHECK_UPPER = 2,
+
+ // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+ // conditions.
+ RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+ // Unrecognized range check condition.
+ RANGE_CHECK_UNKNOWN = (unsigned)-1
+ };
+
+ static StringRef rangeCheckKindToStr(RangeCheckKind);
+
+ const SCEV *Offset = nullptr;
+ const SCEV *Scale = nullptr;
+ Value *Length = nullptr;
+ Use *CheckUse = nullptr;
+ RangeCheckKind Kind = RANGE_CHECK_UNKNOWN;
+
+ static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length);
+
+ static void
+ extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+public:
+ const SCEV *getOffset() const { return Offset; }
+ const SCEV *getScale() const { return Scale; }
+ Value *getLength() const { return Length; }
+
+ void print(raw_ostream &OS) const {
+ OS << "InductiveRangeCheck:\n";
+ OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n";
+ OS << " Offset: ";
+ Offset->print(OS);
+ OS << " Scale: ";
+ Scale->print(OS);
+ OS << " Length: ";
+ if (Length)
+ Length->print(OS);
+ else
+ OS << "(null)";
+ OS << "\n CheckUse: ";
+ getCheckUse()->getUser()->print(OS);
+ OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
+ }
+
+ LLVM_DUMP_METHOD
+ void dump() {
+ print(dbgs());
+ }
+
+ Use *getCheckUse() const { return CheckUse; }
+
+ /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+ /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+ class Range {
+ const SCEV *Begin;
+ const SCEV *End;
+
+ public:
+ Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+ assert(Begin->getType() == End->getType() && "ill-typed range!");
+ }
+
+ Type *getType() const { return Begin->getType(); }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getEnd() const { return End; }
+ };
+
+ /// This is the value the condition of the branch needs to evaluate to for the
+ /// branch to take the hot successor (see (1) above).
+ bool getPassingDirection() { return true; }
+
+ /// Computes a range for the induction variable (IndVar) in which the range
+ /// check is redundant and can be constant-folded away. The induction
+ /// variable is not required to be the canonical {0,+,1} induction variable.
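+ /// For example (editor's note): for the check `0 <= i && i < len` against
+ /// the induction variable {0,+,1}, the safe iteration space is [0, len).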
+ Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar) const;
+
+ /// Parse out a set of inductive range checks from \p BI and append them to \p
+ /// Checks.
+ ///
+ /// NB! There may be conditions feeding into \p BI that aren't inductive range
+ /// checks, and hence don't end up in \p Checks.
+ static void
+ extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+public:
+ static char ID;
+ InductiveRangeCheckElimination() : LoopPass(ID) {
+ initializeInductiveRangeCheckEliminationPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+
+StringRef InductiveRangeCheck::rangeCheckKindToStr(
+ InductiveRangeCheck::RangeCheckKind RCK) {
+ switch (RCK) {
+ case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
+ return "RANGE_CHECK_UNKNOWN";
+
+ case InductiveRangeCheck::RANGE_CHECK_UPPER:
+ return "RANGE_CHECK_UPPER";
+
+ case InductiveRangeCheck::RANGE_CHECK_LOWER:
+ return "RANGE_CHECK_LOWER";
+
+ case InductiveRangeCheck::RANGE_CHECK_BOTH:
+ return "RANGE_CHECK_BOTH";
+ }
+
+ llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
+/// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set
+/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value being
+/// range checked, and set `Length` to the upper limit `Index` is being range
+/// checked with if (and only if) the range check type is stronger or equal to
+/// RANGE_CHECK_UPPER.
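+///
+/// For example (editor's note): `icmp slt i32 %i, %len`, where %len is known
+/// non-negative and loop invariant, parses as RANGE_CHECK_UPPER with `Index`
+/// = %i and `Length` = %len, while `icmp sge i32 %i, 0` parses as
+/// RANGE_CHECK_LOWER with `Index` = %i.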
+///
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length) {
+
+ auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+ const SCEV *S = SE.getSCEV(V);
+ if (isa<SCEVCouldNotCompute>(S))
+ return false;
+
+ return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+ SE.isKnownNonNegative(S);
+ };
+
+ using namespace llvm::PatternMatch;
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ switch (Pred) {
+ default:
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGE:
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGT:
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_UPPER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_ULT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_UGT:
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_BOTH;
+ }
+ return RANGE_CHECK_UNKNOWN;
+ }
+
+ llvm_unreachable("default clause returns!");
+}
+
+void InductiveRangeCheck::extractRangeChecksFromCond(
+ Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ using namespace llvm::PatternMatch;
+
+ Value *Condition = ConditionUse.get();
+ if (!Visited.insert(Condition).second)
+ return;
+
+ if (match(Condition, m_And(m_Value(), m_Value()))) {
+ SmallVector<InductiveRangeCheck, 8> SubChecks;
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
+ SubChecks, Visited);
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
+ SubChecks, Visited);
+
+ if (SubChecks.size() == 2) {
+ // Handle a special case where we know how to merge two checks separately
+ // checking the upper and lower bounds into a full range check.
+ const auto &RChkA = SubChecks[0];
+ const auto &RChkB = SubChecks[1];
+ if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) &&
+ RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) {
+
+ // If RChkA.Kind == RChkB.Kind then we just found two identical checks.
+ // But if one of them is a RANGE_CHECK_LOWER and the other is a
+ // RANGE_CHECK_UPPER (only possibility if they're different) then
+ // together they form a RANGE_CHECK_BOTH.
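+ // For example (editor's note): `0 <= i` (RANGE_CHECK_LOWER) and `i < len`
+ // (RANGE_CHECK_UPPER) over the same index recurrence merge into a single
+ // RANGE_CHECK_BOTH with Length = len.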
+ SubChecks[0].Kind =
+ (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind);
+ SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length;
+ SubChecks[0].CheckUse = &ConditionUse;
+
+ // We updated one of the checks in place, now erase the other.
+ SubChecks.pop_back();
+ }
+ }
+
+ Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end());
+ return;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
+ if (!ICI)
+ return;
+
+ Value *Length = nullptr, *Index;
+ auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length);
+ if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+ return;
+
+ const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+ if (!IsAffineIndex)
+ return;
+
+ InductiveRangeCheck IRC;
+ IRC.Length = Length;
+ IRC.Offset = IndexAddRec->getStart();
+ IRC.Scale = IndexAddRec->getStepRecurrence(SE);
+ IRC.CheckUse = &ConditionUse;
+ IRC.Kind = RCKind;
+ Checks.push_back(IRC);
+}
+
+void InductiveRangeCheck::extractRangeChecksFromBranch(
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks) {
+
+ if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ return;
+
+ BranchProbability LikelyTaken(15, 16);
+
+ if (!SkipProfitabilityChecks &&
+ BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ return;
+
+ SmallPtrSet<Value *, 8> Visited;
+ InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
+ Checks, Visited);
+}
+
+// Add metadata to the loop L to disable loop optimizations. Callers need to
+// confirm that optimizing loop L is not beneficial.
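+// The resulting loop ID is a self-referential node whose operands include,
+// roughly (editor's sketch):
+//   !{!"llvm.loop.unroll.disable"}, !{!"llvm.loop.vectorize.enable", i1 false},
+//   !{!"llvm.loop.licm_versioning.disable"},
+//   !{!"llvm.loop.distribute.enable", i1 false}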
+static void DisableAllLoopOptsOnLoop(Loop &L) {
+ // We do not care about any existing loop ID metadata for L, since we are
+ // replacing it wholesale to disable the loop optimizations below.
+ LLVMContext &Context = L.getHeader()->getContext();
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDNode *Dummy = MDNode::get(Context, {});
+ MDNode *DisableUnroll = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
+ Metadata *FalseVal =
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ MDNode *DisableVectorize = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
+ MDNode *DisableLICMVersioning = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
+ MDNode *DisableDistribution = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
+ MDNode *NewLoopID =
+ MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
+ DisableLICMVersioning, DisableDistribution});
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L.setLoopID(NewLoopID);
+}
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag;
+
+ BasicBlock *Header;
+ BasicBlock *Latch;
+
+ // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr;
+ BasicBlock *LatchExit;
+ unsigned LatchBrExitIdx;
+
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // ... body ...
+
+ Value *IndVarNext;
+ Value *IndVarStart;
+ Value *LoopExitAt;
+ bool IndVarIncreasing;
+
+ LoopStructure()
+ : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+ LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+ IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarNext = Map(IndVarNext);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ return Result;
+ }
+
+ static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+ BranchProbabilityInfo &BPI,
+ Loop &,
+ const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
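+///
+/// For example (editor's note): constraining `for (i = 0; i < n; i++)` to the
+/// range [0, smin(n, len)) yields a main loop whose checks against len are
+/// provably redundant, plus a post loop covering [smin(n, len), n).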
+///
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit;
+ BasicBlock *ExitSelector;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd;
+
+ RewrittenRangeInfo()
+ : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on the low end of the restricted iteration space of the main loop.
+ // `HighLimit` is None if there is no restriction on the high end of the
+ // restricted iteration space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // A utility function that does a `replaceUsesOfWith' on the incoming block
+ // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+ // incoming block list with `ReplaceBy'.
+ static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy);
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ //
+ Optional<SubRanges> calculateSubRanges() const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ //
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Create the appropriate loop structure needed to describe a cloned copy of
+ // `Original`. The clone is described by `VM`.
+ Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM);
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ //
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ //
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LPPassManager &LPM;
+ LoopInfo &LI;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+ const SCEV *LatchTakenCount;
+ BasicBlock *OriginalPreheader;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM,
+ const LoopStructure &LS, ScalarEvolution &SE,
+ DominatorTree &DT, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L),
+ LatchTakenCount(nullptr), OriginalPreheader(nullptr),
+ MainLoopPreheader(nullptr), Range(R), MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+}
+
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) == Block)
+ PN->setIncomingBlock(i, ReplaceBy);
+}
+
+static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
+ APInt SMax =
+ APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(SMax) &&
+ SE.getUnsignedRange(S).contains(SMax);
+}
+
+static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
+ APInt SMin =
+ APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(SMin) &&
+ SE.getUnsignedRange(S).contains(SMin);
+}
+
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ Loop &L, const char *&FailureReason) {
+ if (!L.isLoopSimplifyForm()) {
+ FailureReason = "loop not in LoopSimplify form";
+ return None;
+ }
+
+ BasicBlock *Latch = L.getLoopLatch();
+ assert(Latch && "Simplified loops only have one latch!");
+
+ if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
+ FailureReason = "loop has already been cloned";
+ return None;
+ }
+
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+ BranchProbability ExitProbability =
+ BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+ if (!SkipProfitabilityChecks &&
+ ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+ FailureReason = "short running loop, not profitable";
+ return None;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
+
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+ if (!AR->isAffine())
+ return false;
+
+ // Currently we only work with induction variables that have been proved to
+ // not wrap. This restriction can potentially be lifted in the future.
+
+ if (!HasNoSignedWrap(AR))
+ return false;
+
+ if (const SCEVConstant *StepExpr =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+ ConstantInt *StepCI = StepExpr->getValue();
+ if (StepCI->isOne() || StepCI->isMinusOne()) {
+ IsIncreasing = StepCI->isOne();
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
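+ // For example (editor's note): if the latch compares i.next = {1,+,1} `slt`
+ // n, then StartNext is 1, Addend is -1, and IndVarStart below is 0, the
+ // value of the IV proper on entry to the first iteration.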
+
+ const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+ bool IsIncreasing = false;
+ if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+ ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+ // TODO: generalize the predicates here to also match their unsigned variants.
+ if (IsIncreasing) {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMax(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an slt and not an sle.
+ FailureReason = "limit may overflow when coercing sle to slt";
+ return None;
+ }
+
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SLT, IndVarStart,
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateAdd(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ }
+ } else {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMin(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an sgt and not an sge.
+ FailureReason = "limit may overflow when coercing sge to sgt";
+ return None;
+ }
+
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SGT, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateSub(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+ }
+ }
+
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ Value *IndVarStartV =
+ SCEVExpander(SE, DL, "irce")
+ .expandCodeFor(IndVarStart, IndVarTy, Preheader->getTerminator());
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarNext = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ if (Range.getType() != Ty)
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+ const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+ // range of values the induction variable takes.
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr;
+
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ // induction variable is decreasing we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+ //
+
+ const SCEV *One = SE.getOne(Ty);
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ }
+
+ auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
+ return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ auto *ClonedLatch =
+ cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
+ ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
+ MDNode::get(Ctx, {}));
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto *SBB : successors(OriginalBB)) {
+ if (OriginalLoop.contains(SBB))
+ continue; // not an exit block
+
+ for (Instruction &I : *SBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
+ PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+
+ // We start with a loop with a single latch:
+ //
+ // +--------------------+
+ // | |
+ // | preheader |
+ // | |
+ // +--------+-----------+
+ // | ----------------\
+ // | / |
+ // +--------v----v------+ |
+ // | | |
+ // | header | |
+ // | | |
+ // +--------------------+ |
+ // |
+ // ..... |
+ // |
+ // +--------------------+ |
+ // | | |
+ // | latch >----------/
+ // | |
+ // +-------v------------+
+ // |
+ // |
+ // | +--------------------+
+ // | | |
+ // +---> original exit |
+ // | |
+ // +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ // +--------------------+
+ // | |
+ // | preheader >-------------------------+
+ // | | |
+ // +--------v-----------+ |
+ // | /-------------+ |
+ // | / | |
+ // +--------v--v--------+ | |
+ // | | | |
+ // | header | | +--------+ |
+ // | | | | | |
+ // +--------------------+ | | +-----v-----v-----------+
+ // | | | |
+ // | | | .pseudo.exit |
+ // | | | |
+ // | | +-----------v-----------+
+ // | | |
+ // ..... | | |
+ // | | +--------v-------------+
+ // +--------------------+ | | | |
+ // | | | | | ContinuationBlock |
+ // | latch >------+ | | |
+ // | | | +----------------------+
+ // +---------v----------+ |
+ // | |
+ // | |
+ // | +---------------^-----+
+ // | | |
+ // +-----> .exit.selector |
+ // | |
+ // +----------v----------+
+ // |
+ // +--------------------+ |
+ // | | |
+ // | original exit <----+
+ // | |
+ // +--------------------+
+ //
+
+ RewrittenRangeInfo RRI;
+
+ BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
+ bool Increasing = LS.IndVarIncreasing;
+
+ IRBuilder<> B(PreheaderJump);
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = Increasing
+ ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *TakeBackedgeLoopCond =
+ Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *IterationsLeft = Increasing
+ ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
+ : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (Instruction &I : *LS.Header) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+
+ RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ for (Instruction &I : *LS.LatchExit) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I))
+ replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
+ else
+ break;
+ }
+
+ return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlock,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+
+ unsigned PHIIndex = 0;
+ for (Instruction &I : *LS.Header) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+ if (PN->getIncomingBlock(i) == ContinuationBlock)
+ PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+ }
+
+ LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+ BasicBlock *OldPreheader,
+ const char *Tag) const {
+
+ BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+ BranchInst::Create(LS.Header, Preheader);
+
+ for (Instruction &I : *LS.Header) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+ replacePHIBlock(PN, OldPreheader, Preheader);
+ }
+
+ return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+ Loop *ParentLoop = OriginalLoop.getParentLoop();
+ if (!ParentLoop)
+ return;
+
+ for (BasicBlock *BB : BBs)
+ ParentLoop->addBasicBlockToLoop(BB, LI);
+}
+
+Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM) {
+ Loop &New = *new Loop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPM.addLoop(New);
+
+ // Add all of the blocks in Original to the new loop.
+ for (auto *BB : Original->blocks())
+ if (LI.getLoopFor(BB) == Original)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *SubLoop : *Original)
+ createClonedLoopStructure(SubLoop, &New, VM);
+
+ return &New;
+}
+
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ Optional<SubRanges> MaybeSR = calculateSubRanges();
+ if (!MaybeSR.hasValue()) {
+ DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else {
+ if (CanBeSMin(SE, *SR.HighLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
+ << "\n");
+ return false;
+ }
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else {
+ if (CanBeSMin(SE, *SR.LowLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
+ << "\n");
+ return false;
+ }
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+
+ DT.recalculate(F);
+
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
+ if (!PreLoop.Blocks.empty()) {
+ PreL = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+ }
+
+ if (!PostLoop.Blocks.empty()) {
+ PostL = createClonedLoopStructure(
+ &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
+ // Pre/post loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
+
+ return true;
+}
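For orientation only, here is a hedged source-level sketch of the shape this splitting aims for (hand-written C++ illustrating the general IRCE idea; the function name `constrained` and the helper variables are invented, and this is not output of the pass). The iteration space is cut at points corresponding to `exit.preloop.at` and `exit.mainloop.at`, so the main loop covers only iterations where the range check is known to pass.

#include <cstdio>

// Original shape: every iteration performs the range check before the access.
//   for (int i = begin; i < n; ++i) {
//     if (i < 0 || i >= len) fail();
//     sum += a[i];
//   }
// Constrained shape: preloop and postloop keep the check, the main loop does not.
void constrained(const int *a, int begin, int n, int len, long &sum) {
  int exitPreloopAt = 0;                    // analogous to "exit.preloop.at"
  int exitMainloopAt = len < n ? len : n;   // analogous to "exit.mainloop.at"
  int i = begin;
  for (; i < n && i < exitPreloopAt; ++i) { // preloop: check retained
    if (i < 0 || i >= len) return;          // stand-in for the failing path
    sum += a[i];
  }
  for (; i < exitMainloopAt; ++i)           // main loop: 0 <= i < len is implied
    sum += a[i];
  for (; i < n; ++i) {                      // postloop: check retained
    if (i < 0 || i >= len) return;
    sum += a[i];
  }
}

int main() {
  int a[4] = {1, 2, 3, 4};
  long sum = 0;
  constrained(a, 0, 6, 4, sum); // n runs past the array; postloop bails at i == 4
  printf("%ld\n", sum);         // prints 10
  return 0;
}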
+
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided. If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const {
+ // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+ // variable, that may or may not exist as a real llvm::Value in the loop) and
+ // this inductive range check is a range check on the "C + D * I" ("C" is
+ // getOffset() and "D" is getScale()). We rewrite the value being range
+  // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - N * A".
+ // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code
+ // can be generalized as needed.
+ //
+ // The actual inequalities we solve are of the form
+ //
+ // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
+ //
+ // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions
+ // and subtractions are twos-complement wrapping and comparisons are signed.
+ //
+ // Proof:
+ //
+ // If there exists IndVar such that -M <= IndVar < (L - M) then it follows
+ // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows
+ // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have
+ // overflown.
+ //
+ // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t.
+ // Hence 0 <= (IndVar + M) < L
+
+ // [^1]: Note that the solution does _not_ apply if L < 0; consider values M =
+ // 127, IndVar = 126 and L = -2 in an i8 world.
+
+ if (!IndVar->isAffine())
+ return None;
+
+ const SCEV *A = IndVar->getStart();
+ const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+ if (!B)
+ return None;
+
+ const SCEV *C = getOffset();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale());
+ if (D != B)
+ return None;
+
+ ConstantInt *ConstD = D->getValue();
+ if (!(ConstD->isMinusOne() || ConstD->isOne()))
+ return None;
+
+ const SCEV *M = SE.getMinusSCEV(C, A);
+
+ const SCEV *Begin = SE.getNegativeSCEV(M);
+ const SCEV *UpperLimit = nullptr;
+
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Value *V = getLength()) {
+ UpperLimit = SE.getSCEV(V);
+ } else {
+ assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
+ UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ }
+
+ const SCEV *End = SE.getMinusSCEV(UpperLimit, M);
+ return InductiveRangeCheck::Range(Begin, End);
+}
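As a sanity check on the inequality and the L < 0 footnote in the comment above, here is a small standalone C++ sketch (separate from the pass; the helper name `wadd` is invented) that evaluates both predicates under i8 wrapping arithmetic, including the footnote's counterexample with M = 127, IndVar = 126, L = -2.

#include <cstdint>
#include <cstdio>

// Two's-complement wrapping i8 addition, matching the comment's assumptions.
static int8_t wadd(int8_t A, int8_t B) {
  return static_cast<int8_t>(static_cast<uint8_t>(A) + static_cast<uint8_t>(B));
}

int main() {
  // L >= 0: "0 <= M + IndVar < L" and "-M <= IndVar < L - M" agree everywhere.
  const int8_t M = 5, L = 100;
  bool Agree = true;
  for (int I = -128; I <= 127; ++I) {
    int8_t IndVar = static_cast<int8_t>(I);
    bool RangeCheck = 0 <= wadd(M, IndVar) && wadd(M, IndVar) < L;
    bool SafeIter = wadd(0, -M) <= IndVar && IndVar < wadd(L, -M);
    Agree &= (RangeCheck == SafeIter);
  }
  printf("L >= 0: predicates agree everywhere = %d\n", Agree);    // prints 1

  // The footnote's counterexample for L < 0: M = 127, IndVar = 126, L = -2.
  const int8_t M2 = 127, IndVar2 = 126, L2 = -2;
  printf("-M <= IndVar < L - M : %d\n",
         wadd(0, -M2) <= IndVar2 && IndVar2 < wadd(L2, -M2));      // prints 1
  printf("0 <= M + IndVar < L  : %d\n",
         0 <= wadd(M2, IndVar2) && wadd(M2, IndVar2) < L2);        // prints 0
  return 0;
}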
+
+static Optional<InductiveRangeCheck::Range>
+IntersectRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ return InductiveRangeCheck::Range(NewBegin, NewEnd);
+}
+
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ if (L->getBlocks().size() >= LoopSizeCutoff) {
+ DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ return false;
+ }
+
+ LLVMContext &Context = Preheader->getContext();
+ SmallVector<InductiveRangeCheck, 16> RangeChecks;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+
+ for (auto BBI : L->getBlocks())
+ if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
+ RangeChecks);
+
+ if (RangeChecks.empty())
+ return false;
+
+ auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+ OS << "irce: looking at loop "; L->print(OS);
+ OS << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck &IRC : RangeChecks)
+ IRC.print(OS);
+ };
+
+ DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+ if (PrintRangeChecks)
+ PrintRecognizedRangeChecks(errs());
+
+ const char *FailureReason = nullptr;
+ Optional<LoopStructure> MaybeLoopStructure =
+ LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+ if (!MaybeLoopStructure.hasValue()) {
+ DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+ << "\n";);
+ return false;
+ }
+ LoopStructure LS = MaybeLoopStructure.getValue();
+ bool Increasing = LS.IndVarIncreasing;
+ const SCEV *MinusOne =
+ SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true);
+ const SCEVAddRecExpr *IndVar =
+ cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne));
+
+ Optional<InductiveRangeCheck::Range> SafeIterRange;
+ Instruction *ExprInsertPt = Preheader->getTerminator();
+
+ SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
+
+ IRBuilder<> B(ExprInsertPt);
+ for (InductiveRangeCheck &IRC : RangeChecks) {
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar);
+ if (Result.hasValue()) {
+ auto MaybeSafeIterRange =
+ IntersectRange(SE, SafeIterRange, Result.getValue());
+ if (MaybeSafeIterRange.hasValue()) {
+ RangeChecksToEliminate.push_back(IRC);
+ SafeIterRange = MaybeSafeIterRange.getValue();
+ }
+ }
+ }
+
+ if (!SafeIterRange.hasValue())
+ return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
+ LS, SE, DT, SafeIterRange.getValue());
+ bool Changed = LC.run();
+
+ if (Changed) {
+ auto PrintConstrainedLoopInfo = [L]() {
+ dbgs() << "irce: in function ";
+ dbgs() << L->getHeader()->getParent()->getName() << ": ";
+ dbgs() << "constrained ";
+ L->print(dbgs());
+ };
+
+ DEBUG(PrintConstrainedLoopInfo());
+
+ if (PrintChangedLoops)
+ PrintConstrainedLoopInfo();
+
+ // Optimize away the now-redundant range checks.
+
+ for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
+ ? ConstantInt::getTrue(Context)
+ : ConstantInt::getFalse(Context);
+ IRC.getCheckUse()->set(FoldedRangeCheck);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+ return new InductiveRangeCheckElimination;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
new file mode 100644
index 000000000000..89b28f0aeee6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -0,0 +1,969 @@
+//===-- InferAddressSpaces.cpp - Infer address spaces ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero, so-called "specific" address spaces to represent
+// memory spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called "generic" address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
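As a pocket illustration of the lattice described above, and nothing more (the pass's own joinAddressSpaces further down is the authoritative version), here is a hedged C++ sketch of the join operation; the constants kFlat and kUninitialized are invented for the illustration.

#include <cassert>

// Hypothetical constants for the sketch only.
constexpr unsigned kFlat = 0;            // the generic address space
constexpr unsigned kUninitialized = ~0u; // top of the lattice

// join moves values down the lattice: uninitialized > specific > flat.
static unsigned join(unsigned A, unsigned B) {
  if (A == kFlat || B == kFlat)
    return kFlat;
  if (A == kUninitialized)
    return B;
  if (B == kUninitialized)
    return A;
  return A == B ? A : kFlat; // two different specific spaces fall to generic
}

int main() {
  assert(join(kUninitialized, 3) == 3); // first specific space seen wins
  assert(join(3, 3) == 3);              // agreement stays specific
  assert(join(3, 1) == kFlat);          // disagreement falls to generic
  return 0;
}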
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+namespace {
+static const unsigned UninitializedAddressSpace = ~0u;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief InferAddressSpaces
+class InferAddressSpaces : public FunctionPass {
+ /// Target specific address space which uses of should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+};
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeInferAddressSpacesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, select, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:{
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isZero())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.push_back(std::make_pair(CE, false));
+
+ return;
+ }
+
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited.insert(V).second) {
+ PostorderStack.push_back(std::make_pair(V, false));
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+  // Look at operations that may be interesting to accelerate by moving to a
+  // known address space. We primarily aim at faster loads and stores, but
+  // pure addressing calculations may also benefit.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().first;
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
+ }
+ return Postorder;
+}
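The traversal above is the standard iterative postorder built from (value, children-expanded) stack pairs. Here is a hedged, self-contained C++ sketch of the same technique over a toy successor list (all names are invented for the illustration):

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

// Iterative postorder using (node, expanded) pairs, mirroring PostorderStack.
std::vector<int> postorder(const std::vector<std::vector<int>> &Succ, int Root) {
  std::vector<int> Order;
  std::set<int> Visited{Root};
  std::vector<std::pair<int, bool>> Stack{{Root, false}};
  while (!Stack.empty()) {
    auto [Node, Expanded] = Stack.back();
    if (Expanded) {              // successors already explored: emit the node
      Order.push_back(Node);
      Stack.pop_back();
      continue;
    }
    Stack.back().second = true;  // mark, then push unvisited successors
    for (int S : Succ[Node])
      if (Visited.insert(S).second)
        Stack.emplace_back(S, false);
  }
  return Order;
}

int main() {
  //   0 -> 1 -> 2, and 0 -> 2
  std::vector<std::vector<int>> Succ = {{1, 2}, {2}, {}};
  for (int N : postorder(Succ, 0))
    printf("%d ", N);            // prints "2 1 0": children before parents
  printf("\n");
  return 0;
}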
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that the caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select: {
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ // Computes the operands of the new constant expression.
+ bool IsNew = false;
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+  // If !IsNew, we would be replacing the Value with itself. However, replaced
+  // values are assumed to be wrapped in an addrspacecast later, so drop it now.
+ if (!IsNew)
+ return nullptr;
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ // All values in Postorder are flat address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
+
+bool InferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ FlatAddrSpace = TTI.getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+
+ // Collects all flat address expressions in postorder.
+ std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+  // Changes the address spaces of the flat address expressions that are
+  // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, so wrap values in WeakTrackingVH.
+void InferAddressSpaces::inferAddressSpaces(
+ ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+    // If any update is made, adds the users of V to the worklist because
+    // their address spaces may also need to be updated.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS = I != InferredAddrSpace.end() ?
+ I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+/// Returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ !CmpX->isVolatile();
+ }
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// Returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+ ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ SmallVector<Instruction *, 16> DeadInstructions;
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (const WeakTrackingVH &WVH : Postorder) {
+ assert(WVH && "value was unexpectedly deleted");
+ Value *V = WVH;
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ DEBUG(dbgs() << "Replacing the uses of " << *V
+ << "\n with\n " << *NewV << '\n');
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ C->getType());
+ if (C != Replace) {
+ DEBUG(dbgs() << "Inserting replacement const cast: "
+ << Replace << ": " << *Replace << '\n');
+ C->replaceAllUsesWith(Replace);
+ V = Replace;
+ }
+ }
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may see the same pointer operand in multiple operands. Skip
+ // to the next instruction.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(U)) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
+ // Handle more complex cases like intrinsic that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ if (ASC->getDestAddressSpace() == NewAS) {
+ ASC->replaceAllUsesWith(NewV);
+ DeadInstructions.push_back(ASC);
+ continue;
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty()) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ DeadInstructions.push_back(I);
+ }
+ }
+
+ for (Instruction *I : DeadInstructions)
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+
+ return true;
+}
+
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 000000000000..ee3de51b1360
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,2355 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <memory>
+using namespace llvm;
+using namespace jumpthreading;
+
+#define DEBUG_TYPE "jump-threading"
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
+
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
+namespace {
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ ///
+ class JumpThreading : public FunctionPass {
+ JumpThreadingPass Impl;
+
+ public:
+ static char ID; // Pass identification
+ JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
+ initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+ };
+}
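To make the description in the class comment above concrete, here is a hedged source-level sketch (hand-written C++; `before`, `after`, and the helper bodies are invented, and the pass itself works on the CFG rather than on source) of what threading the X = 4 predecessor past the X < 3 test amounts to.

// Stand-ins for arbitrary side effects on either side of the second test.
static void thenBody() {}
static void elseBody() {}

// Before: the (X < 3) block has two predecessors; the one that stores 4
// always reaches the false successor.
void before(bool cond, int X) {
  if (cond)
    X = 4;
  if (X < 3)
    thenBody();
  else
    elseBody();
}

// After (source-level equivalent of the threaded CFG): the edge from the
// X = 4 predecessor is forwarded straight to the false side by duplicating
// the small comparison block onto the other path.
void after(bool cond, int X) {
  if (cond) {
    X = 4;          // kept for symmetry; X < 3 is statically false here
    elseBody();
  } else if (X < 3) {
    thenBody();
  } else {
    elseBody();
  }
}

int main() {
  before(true, 0);
  after(true, 0);   // both take the elseBody() path when cond is true
  return 0;
}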
+
+char JumpThreading::ID = 0;
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
+
+JumpThreadingPass::JumpThreadingPass(int T) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+}
+
+/// runOnFunction - Top level algorithm.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ return Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+}
+
+PreservedAnalyses JumpThreadingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
+ std::unique_ptr<BlockFrequencyInfo> BFI_,
+ std::unique_ptr<BranchProbabilityInfo> BPI_) {
+
+ DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ TLI = TLI_;
+ LVI = LVI_;
+ AA = AA_;
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+  // successful jump threading, which requires both BPI and BFI to be available.
+ HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
+ if (HasProfileData) {
+ BPI = std::move(BPI_);
+ BFI = std::move(BFI_);
+ }
+
+  // Remove unreachable blocks from the function, as they may cause the pass to
+  // loop forever. We thread a branch whenever we find something profitable, and
+  // jump threading a branch can create other opportunities. If these
+  // opportunities form a cycle, i.e. if any jump threading undoes previous
+  // threading on the path, we would loop forever. We avoid this by not jump
+  // threading across back edges. That works for normal code, but not for
+  // unreachable blocks, which may contain cycles with no back edge.
+ bool EverChanged = false;
+ EverChanged |= removeUnreachableBlocks(F, LVI);
+
+ FindLoopHeaders(F);
+
+ bool Changed;
+ do {
+ Changed = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+ BasicBlock *BB = &*I;
+ // Thread all of the branches we can over this block.
+ while (ProcessBlock(BB))
+ Changed = true;
+
+ ++I;
+
+ // If the block is trivially dead, zap it. This eliminates the successor
+ // edges which simplifies the CFG.
+ if (pred_empty(BB) &&
+ BB != &BB->getParent()->getEntryBlock()) {
+ DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
+ << "' with terminator: " << *BB->getTerminator() << '\n');
+ LoopHeaders.erase(BB);
+ LVI->eraseBlock(BB);
+ DeleteDeadBlock(BB);
+ Changed = true;
+ continue;
+ }
+
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+
+ // Can't thread an unconditional jump, but if the block is "almost
+ // empty", we can replace uses of it with uses of the successor and make
+ // this dead.
+ // We should not eliminate the loop header either, because eliminating
+ // a loop header might later prevent LoopSimplify from transforming nested
+ // loops into simplified form.
+ if (BI && BI->isUnconditional() &&
+ BB != &BB->getParent()->getEntryBlock() &&
+ // If the terminator is the only non-phi instruction, try to nuke it.
+ BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
+ // FIXME: It is always conservatively correct to drop the info
+ // for a block even if it doesn't get erased. This isn't totally
+ // awesome, but it allows us to use AssertingVH to prevent nasty
+ // dangling pointer issues within LazyValueInfo.
+ LVI->eraseBlock(BB);
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
+ Changed = true;
+ }
+ }
+ EverChanged |= Changed;
+ } while (Changed);
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are uses of
+// `Cond` and we used the guards/assumes to reason about the `Cond` value
+// at the end of the block. RAUW unconditionally replaces all uses,
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
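+// For illustration (a hypothetical sketch; names invented):
+//   %cond = icmp eq i32 %x, 0
+//   %a = select i1 %cond, i32 1, i32 2                 ; use before the guard
+//   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+//   %b = zext i1 %cond to i32                          ; use after the guard
+// If %cond is only known true *because of* the guard, it is safe to fold %b
+// but not %a or the guard operand itself. The backwards scan below therefore
+// stops at the first instruction that is not guaranteed to transfer execution
+// to its successor (such as the guard), while uses in blocks strictly
+// dominated by BB are handled by replaceNonLocalUsesWith.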
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+ assert(Cond->getType() == ToVal->getType());
+ auto *BB = Cond->getParent();
+ // We can unconditionally replace all uses in non-local blocks (i.e. uses
+ // strictly dominated by BB), since LVI information is true from the
+ // terminator of BB.
+ replaceNonLocalUsesWith(Cond, ToVal);
+ for (Instruction &I : reverse(*BB)) {
+ // Reached the Cond whose uses we are trying to replace, so there are no
+ // more uses.
+ if (&I == Cond)
+ break;
+ // We only replace uses in instructions that are guaranteed to reach the end
+ // of BB, where we know Cond is ToVal.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ I.replaceUsesOfWith(Cond, ToVal);
+ }
+ if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+ Cond->eraseFromParent();
+}
+
+/// Return the cost of duplicating a piece of this block, from the first
+/// non-phi instruction up to (but not including) the StopAt instruction, in
+/// order to thread across it. Stop scanning the block when exceeding the
+/// threshold. If duplication is impossible, returns ~0U.
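+/// For illustration, a rough worked example of the accounting below: a block
+/// containing a pointer-to-pointer bitcast (free), a non-intrinsic call
+/// (counted as 4) and an add (counted as 1), ending in a switch (bonus 6),
+/// yields max(0, 5 - 6) = 0, i.e. it is considered very cheap to duplicate.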
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
+ unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
+ // Ignore PHI nodes; these will be flattened when duplication happens.
+ BasicBlock::const_iterator I(BB->getFirstNonPHI());
+
+ // FIXME: THREADING will delete values that are just used to compute the
+ // branch, so they shouldn't count against the duplication cost.
+
+ unsigned Bonus = 0;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
+
+ // Bump the threshold up so the early exit from the loop doesn't skip the
+ // terminator-based Size adjustment at the end.
+ Threshold += Bonus;
+
+ // Sum up the cost of each instruction until we get to the terminator. Don't
+ // include the terminator because the copy won't include it.
+ unsigned Size = 0;
+ for (; &*I != StopAt; ++I) {
+
+ // Stop scanning the block if we've reached the threshold.
+ if (Size > Threshold)
+ return Size;
+
+ // Debugger intrinsics don't incur code size.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
+ // If this is a pointer->pointer bitcast, it is free.
+ if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+ continue;
+
+ // Bail out if this instruction gives back a token type, it is not possible
+ // to duplicate it if it is used outside this BB.
+ if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+ return ~0U;
+
+ // All other instructions count for at least one unit.
+ ++Size;
+
+ // Calls are more expensive. If they are non-intrinsic calls, we model them
+ // as having cost of 4. If they are a non-vector intrinsic, we model them
+ // as having cost of 2 total, and if they are a vector intrinsic, we model
+ // them as having cost 1.
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (CI->cannotDuplicate() || CI->isConvergent())
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ return ~0U;
+ else if (!isa<IntrinsicInst>(CI))
+ Size += 3;
+ else if (!CI->getType()->isVectorTy())
+ Size += 1;
+ }
+ }
+
+ return Size > Bonus ? Size - Bonus : 0;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops. Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations. To prevent this from
+/// happening, we first have to find the loop headers. Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header. For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop). This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+///
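+/// For illustration (a hypothetical CFG): with edges entry->header,
+/// header->body, body->header and header->exit, FindFunctionBackedges reports
+/// the edge body->header, so `header` is recorded in LoopHeaders and threading
+/// across it is suppressed.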
+void JumpThreadingPass::FindLoopHeaders(Function &F) {
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ for (const auto &Edge : Edges)
+ LoopHeaders.insert(Edge.second);
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
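+/// For illustration: with WantInteger, a ConstantInt (or undef) is returned
+/// as-is and anything else yields null; with WantBlockAddress, Val is first
+/// stripped of pointer casts and only a BlockAddress (or undef) is accepted.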
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+ if (!Val)
+ return nullptr;
+
+ // Undef is "known" enough.
+ if (UndefValue *U = dyn_cast<UndefValue>(Val))
+ return U;
+
+ if (Preference == WantBlockAddress)
+ return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+ return dyn_cast<ConstantInt>(Val);
+}
+
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors. If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
+///
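+/// For illustration (hypothetical IR): if BB starts with
+///   %p = phi i1 [ true, %a ], [ %x, %b ]
+/// then querying %p would produce (true, %a), plus (%x's constant, %b) when
+/// LVI can prove %x constant on the %b -> BB edge.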
+bool JumpThreadingPass::ComputeValueKnownInPredecessors(
+ Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference, Instruction *CxtI) {
+ // This method walks up use-def chains recursively. Because of this, we could
+ // get into an infinite loop going around loops in the use-def chain. To
+ // prevent this, keep track of what (value, block) pairs we've already visited
+ // and terminate the search if we loop back to them.
+ if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+ return false;
+
+ // An RAII helper to remove this pair from the recursion set once the
+ // recursion stack pops back out again.
+ RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
+ // If V is a constant, then it is known in all predecessors.
+ if (Constant *KC = getKnownConstant(V, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+
+ return !Result.empty();
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+ if (Constant *KC = getKnownConstant(PredCst, Preference))
+ Result.push_back(std::make_pair(KC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ // If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (Constant *KC = getKnownConstant(InVal, Preference)) {
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ } else {
+ Constant *CI = LVI->getConstantOnEdge(InVal,
+ PN->getIncomingBlock(i),
+ BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference))
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle Cast instructions. Only see through a Cast when the source operand
+ // is a PHI or a Cmp and the source type is i1, to save compilation time.
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Source = CI->getOperand(0);
+ if (!Source->getType()->isIntegerTy(1))
+ return false;
+ if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
+ return false;
+ ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Convert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
+
+ return true;
+ }
+
+ PredValueInfoTy LHSVals, RHSVals;
+
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ assert(Preference == WantInteger && "One-bit non-integer type?");
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+ ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
+ WantInteger, CxtI);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+ // Scan for the sentinel. If we find an undef, force it to the
+ // interesting value: x|undef -> true and x&undef -> false.
+ for (const auto &LHSVal : LHSVals)
+ if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+ Result.emplace_back(InterestingVal, LHSVal.second);
+ LHSKnownBBs.insert(LHSVal.second);
+ }
+ for (const auto &RHSVal : RHSVals)
+ if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+ // If we already inferred a value for this block on the LHS, don't
+ // re-add it.
+ if (!LHSKnownBBs.count(RHSVal.second))
+ Result.emplace_back(InterestingVal, RHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
+ WantInteger, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getNot(R.first);
+
+ return true;
+ }
+
+ // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ assert(Preference != WantBlockAddress
+ && "A binary operator creating a block address?");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ // Try to use constant folding to simplify the binary operator.
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ assert(Preference == WantInteger && "Compares only produce integers");
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS = PN->getIncomingValue(i);
+ Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
+ if (!Res) {
+ if (!isa<Constant>(RHS))
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.push_back(std::make_pair(KC, PredBB));
+ }
+
+ return !Result.empty();
+ }
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate Res =
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(CmpType, Res);
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add (x, C1)), C2). See if we have such a thing with
+ // x as a live-in.
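+ // For illustration (hypothetical IR): a range check such as
+ // "5 <= %x && %x < 15" may have been folded by InstCombine into
+ //   %a = add i32 %x, -5
+ //   %c = icmp ult i32 %a, 10
+ // If LVI knows %x is in [6, 10) on a predecessor edge, the propagated range
+ // [1, 5) lies inside the "true" region [0, 10), so %c folds to true there.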
+ {
+ using namespace PatternMatch;
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
+ // Try to find a constant value for the LHS of a comparison,
+ // and evaluate it statically if we can.
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // Handle select instructions where at least one operand is a known constant
+ // and we can figure out the condition value for any predecessor block.
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+ Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+ PredValueInfoTy Conds;
+ if ((TrueVal || FalseVal) &&
+ ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
+ WantInteger, CxtI)) {
+ for (auto &C : Conds) {
+ Constant *Cond = C.first;
+
+ // Figure out what value to use for the condition.
+ bool KnownCond;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+ // A known boolean.
+ KnownCond = CI->isOne();
+ } else {
+ assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+ // Either operand will do, so be sure to pick the one that's a known
+ // constant.
+ // FIXME: Do this more cleverly if both values are known constants?
+ KnownCond = (TrueVal != nullptr);
+ }
+
+ // See if the select has a known constant value for this predecessor.
+ if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+ Result.push_back(std::make_pair(Val, C.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ // If all else fails, see if LVI can figure out a constant value for us.
+ Constant *CI = LVI->getConstant(V, BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+ }
+
+ return !Result.empty();
+}
+
+
+
+/// GetBestDestForJumpOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors. This should reduce the in-degree of the others.
+///
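+/// For illustration: if the terminator's successors are S0 (four predecessors)
+/// and S1 (two predecessors), index 1 is returned and the caller revectors the
+/// undef branch to S1.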
+static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+ TerminatorInst *BBTerm = BB->getTerminator();
+ unsigned MinSucc = 0;
+ BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+ // Compute the successor with the minimum number of predecessors.
+ unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ TestBB = BBTerm->getSuccessor(i);
+ unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+ if (NumPreds < MinNumPreds) {
+ MinSucc = i;
+ MinNumPreds = NumPreds;
+ }
+ }
+
+ return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+ if (!BB->hasAddressTaken()) return false;
+
+ // If the block has its address taken, it may be a tree of dead constants
+ // hanging off of it. These shouldn't keep the block alive.
+ BlockAddress *BA = BlockAddress::get(BB);
+ BA->removeDeadConstantUsers();
+ return !BA->use_empty();
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
+ // If the block is trivially dead, just return and let the caller nuke it.
+ // This simplifies other transformations.
+ if (pred_empty(BB) &&
+ BB != &BB->getParent()->getEntryBlock())
+ return false;
+
+ // If this block has a single predecessor, and if that pred has a single
+ // successor, merge the blocks. This encourages recursive jump threading
+ // because now the condition in this block can be threaded through
+ // predecessors of our predecessor block.
+ if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
+ const TerminatorInst *TI = SinglePred->getTerminator();
+ if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
+ SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ LVI->eraseBlock(SinglePred);
+ MergeBasicBlockIntoOnlyPred(BB);
+
+ // Now that BB is merged into SinglePred (i.e. SinglePred's code followed by
+ // BB's code within the single basic block BB), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+ // Note that this LVI info for blocks BB and SinglePred is correct for %p
+ // (info2 and info1 respectively). After the merge and the deletion of the
+ // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (any_of(*BB, [](Instruction &I) {
+ return !isGuaranteedToTransferExecutionToSuccessor(&I);
+ }))
+ LVI->eraseBlock(BB);
+ return true;
+ }
+ }
+
+ if (TryToUnfoldSelectInCurrBB(BB))
+ return true;
+
+ // Look if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
+ // What kind of constant we're looking for.
+ ConstantPreference Preference = WantInteger;
+
+ // Look to see if the terminator is a conditional branch, switch or indirect
+ // branch, if not we can't thread it.
+ Value *Condition;
+ Instruction *Terminator = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+ // Can't thread an unconditional jump.
+ if (BI->isUnconditional()) return false;
+ Condition = BI->getCondition();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+ Condition = SI->getCondition();
+ } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+ // Can't thread indirect branch with no successors.
+ if (IB->getNumSuccessors() == 0) return false;
+ Condition = IB->getAddress()->stripPointerCasts();
+ Preference = WantBlockAddress;
+ } else {
+ return false; // Must be an invoke.
+ }
+
+ // Run constant folding to see if we can reduce the condition to a simple
+ // constant.
+ if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+ if (SimpleVal) {
+ I->replaceAllUsesWith(SimpleVal);
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ Condition = SimpleVal;
+ }
+ }
+
+ // If the terminator is branching on an undef, we can pick any of the
+ // successors to branch to. Let GetBestDestForJumpOnUndef decide.
+ if (isa<UndefValue>(Condition)) {
+ unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+
+ // Fold the branch/switch.
+ TerminatorInst *BBTerm = BB->getTerminator();
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ if (i == BestSucc) continue;
+ BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+ }
+
+ DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
+ BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+ BBTerm->eraseFromParent();
+ return true;
+ }
+
+ // If the terminator of this block is branching on a constant, simplify the
+ // terminator to an unconditional branch. This can occur due to threading in
+ // other blocks.
+ if (getKnownConstant(Condition, Preference)) {
+ DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator() << '\n');
+ ++NumFolds;
+ ConstantFoldTerminator(BB, true);
+ return true;
+ }
+
+ Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+ // All the rest of our checks depend on the condition being an instruction.
+ if (!CondInst) {
+ // FIXME: Unify this with code below.
+ if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
+ return true;
+ return false;
+ }
+
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+ // If we're branching on a conditional, LVI might be able to determine
+ // its value at the branch instruction. We only handle comparisons
+ // against a constant at this time.
+ // TODO: This should be extended to handle switches as well.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into an
+ // unconditional one, because the branch is no longer interesting as far as
+ // jump threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
+ LazyValueInfo::Tristate Ret =
+ LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+ CondConst, CondBr);
+ if (Ret != LazyValueInfo::Unknown) {
+ unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+ unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+ CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+ BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+ CondBr->eraseFromParent();
+ if (CondCmp->use_empty())
+ CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes, that have the `Cond` as the use. This
+ // is because we use the guards/assume to reason about the `Cond` value
+ // at the end of block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (CondCmp->getParent() == BB) {
+ auto *CI = Ret == LazyValueInfo::True ?
+ ConstantInt::getTrue(CondCmp->getType()) :
+ ConstantInt::getFalse(CondCmp->getType());
+ ReplaceFoldableUses(CondCmp, CI);
+ }
+ return true;
+ }
+
+ // We did not manage to simplify this branch, try to see whether
+ // CondCmp depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
+ }
+
+ // Check for some cases that are worth simplifying. Right now we want to look
+ // for loads that are used by a switch or by the condition for the branch. If
+ // we see one, check to see if it's partially redundant. If so, insert a PHI
+ // which can then be used to thread the values.
+ //
+ Value *SimplifyValue = CondInst;
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+ if (isa<Constant>(CondCmp->getOperand(1)))
+ SimplifyValue = CondCmp->getOperand(0);
+
+ // TODO: There are other places where load PRE would be profitable, such as
+ // more complex comparisons.
+ if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+ if (SimplifyPartiallyRedundantLoad(LI))
+ return true;
+
+ // Handle a variety of cases where we are branching on something derived from
+ // a PHI node in the current block. If we can prove that any predecessors
+ // compute a predictable value based on a PHI node, thread those predecessors.
+ //
+ if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
+ return true;
+
+ // If this is an otherwise-unfoldable branch on a phi node in the current
+ // block, see if we can simplify.
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ return ProcessBranchOnPHI(PN);
+
+ // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+ if (CondInst->getOpcode() == Instruction::Xor &&
+ CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+
+ // Search for a stronger dominating condition that can be used to simplify a
+ // conditional branch leaving BB.
+ if (ProcessImpliedCondition(BB))
+ return true;
+
+ return false;
+}
+
+bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ BasicBlock *CurrentBB = BB;
+ BasicBlock *CurrentPred = BB->getSinglePredecessor();
+ unsigned Iter = 0;
+
+ auto &DL = BB->getModule()->getDataLayout();
+
+ while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+ auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ return false;
+ if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
+ return false;
+
+ bool FalseDest = PBI->getSuccessor(1) == CurrentBB;
+ Optional<bool> Implication =
+ isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest);
+ if (Implication) {
+ BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB);
+ BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI);
+ BI->eraseFromParent();
+ return true;
+ }
+ CurrentBB = CurrentPred;
+ CurrentPred = CurrentBB->getSinglePredecessor();
+ }
+
+ return false;
+}
+
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node. This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
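+/// For illustration (a hypothetical sketch): if LoadBB has predecessors %a and
+/// %b, %a stores 0 to %p and %b neither loads nor stores %p, then
+///   %v = load i32, i32* %p
+/// in LoadBB is partially redundant: assuming the %b edge is not critical, a
+/// reload is inserted at the end of %b and %v becomes a phi of the stored 0
+/// and the new load.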
+bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+ // Don't hack volatile and ordered loads.
+ if (!LI->isUnordered()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
+ // instructions between them.
+ if (LoadBB->isEHPad())
+ return false;
+
+ Value *LoadedPtr = LI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB and it's not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt(LI);
+ bool IsLoadCSE;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+
+ if (IsLoadCSE) {
+ LoadInst *NLI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLI, LI);
+ }
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ if (AvailableVal->getType() != LI->getType())
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
+ LI->replaceAllUsesWith(AvailableVal);
+ LI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LI->getAAMetadata(AATags);
+
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+ typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = nullptr;
+ SmallVector<LoadInst*, 8> CSELoads;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (BasicBlock *PredBB : predecessors(LoadBB)) {
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB).second)
+ continue;
+
+ BBIt = PredBB->end();
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE loads that are volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ if (IsLoadCSE)
+ CSELoads.push_back(cast<LoadInst>(PredAvailable));
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = nullptr;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (const auto &AvailablePred : AvailablePreds)
+ AvailablePredSet.insert(AvailablePred.first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (BasicBlock *P : predecessors(LoadBB)) {
+ // If the predecessor is an indirect goto, we can't split the edge.
+ if (isa<IndirectBrInst>(P->getTerminator()))
+ return false;
+
+ if (!AvailablePredSet.count(P))
+ PredsToSplit.push_back(P);
+ }
+
+ // Split them out to their own block.
+ UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSyncScopeID(), UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LI->getDebugLoc());
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
+
+ AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+ }
+
+ // Now we know that each predecessor of this block has a value in
+ // AvailablePreds, sort them for efficient access as we're walking the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ &LoadBB->front());
+ PN->takeName(LI);
+ PN->setDebugLoc(LI->getDebugLoc());
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ AvailablePredsTy::iterator I =
+ std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+ std::make_pair(P, (Value*)nullptr));
+
+ assert(I != AvailablePreds.end() && I->first == P &&
+ "Didn't find entry for predecessor!");
+
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
+ }
+
+ for (LoadInst *PredLI : CSELoads) {
+ combineMetadataForCSE(PredLI, LI);
+ }
+
+ LI->replaceAllUsesWith(PN);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+/// FindMostPopularDest - The specified list contains multiple possible
+/// threadable destinations. Pick the one that occurs the most frequently in
+/// the list.
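+/// For illustration: given entries {(P1, D1), (P2, D2), (P3, D1), (P4, null)},
+/// the undef (null) destination is ignored and D1 wins with a count of two;
+/// ties are broken by the successor order of BB's terminator.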
+static BasicBlock *
+FindMostPopularDest(BasicBlock *BB,
+ const SmallVectorImpl<std::pair<BasicBlock*,
+ BasicBlock*> > &PredToDestList) {
+ assert(!PredToDestList.empty());
+
+ // Determine popularity. If there are multiple possible destinations, we
+ // explicitly choose to ignore 'undef' destinations. We prefer to thread
+ // blocks with known and real destinations to threading undef. We'll handle
+ // them later if interesting.
+ DenseMap<BasicBlock*, unsigned> DestPopularity;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second)
+ DestPopularity[PredToDest.second]++;
+
+ // Find the most popular dest.
+ DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+ BasicBlock *MostPopularDest = DPI->first;
+ unsigned Popularity = DPI->second;
+ SmallVector<BasicBlock*, 4> SamePopularity;
+
+ for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+ // If the popularity of this entry isn't higher than the popularity we've
+ // seen so far, ignore it.
+ if (DPI->second < Popularity)
+ ; // ignore.
+ else if (DPI->second == Popularity) {
+ // If it is the same as what we've seen so far, keep track of it.
+ SamePopularity.push_back(DPI->first);
+ } else {
+ // If it is more popular, remember it.
+ SamePopularity.clear();
+ MostPopularDest = DPI->first;
+ Popularity = DPI->second;
+ }
+ }
+
+ // Okay, now we know the most popular destination. If more than one
+ // destination ties for the highest popularity, we need to pick one. This is
+ // arbitrary, but we need to make a deterministic decision. Pick the first one
+ // that appears in the successor list.
+ if (!SamePopularity.empty()) {
+ SamePopularity.push_back(MostPopularDest);
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0; ; ++i) {
+ assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+ if (!is_contained(SamePopularity, TI->getSuccessor(i)))
+ continue;
+
+ MostPopularDest = TI->getSuccessor(i);
+ break;
+ }
+ }
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopularDest;
+}
+
+bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+ ConstantPreference Preference,
+ Instruction *CxtI) {
+ // If threading this would thread across a loop header, don't even try to
+ // thread the edge.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ PredValueInfoTy PredValues;
+ if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
+ return false;
+
+ assert(!PredValues.empty() &&
+ "ComputeValueKnownInPredecessors returned true with no values");
+
+ DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
+ << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
+
+ // Decide what we want to thread through. Convert our list of known values to
+ // a list of known destinations for each pred. This also discards duplicate
+ // predecessors and keeps track of the undefined inputs (which are represented
+ // as a null dest in the PredToDestList).
+ SmallPtrSet<BasicBlock*, 16> SeenPreds;
+ SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+ BasicBlock *OnlyDest = nullptr;
+ BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+
+ unsigned PredWithKnownDest = 0;
+ for (const auto &PredValue : PredValues) {
+ BasicBlock *Pred = PredValue.second;
+ if (!SeenPreds.insert(Pred).second)
+ continue; // Duplicate predecessor entry.
+
+ Constant *Val = PredValue.first;
+
+ BasicBlock *DestBB;
+ if (isa<UndefValue>(Val))
+ DestBB = nullptr;
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
+ } else {
+ assert(isa<IndirectBrInst>(BB->getTerminator())
+ && "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
+ DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+ }
+
+ // If we have exactly one destination, remember it for efficiency below.
+ if (PredToDestList.empty()) {
+ OnlyDest = DestBB;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+ // It is possible that we have the same destination but a different value,
+ // e.g. the default case in a switchinst.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // We know where this predecessor is going.
+ ++PredWithKnownDest;
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination.
+ if (isa<IndirectBrInst>(Pred->getTerminator()))
+ continue;
+
+ PredToDestList.push_back(std::make_pair(Pred, DestBB));
+ }
+
+ // If all edges were unthreadable, we fail.
+ if (PredToDestList.empty())
+ return false;
+
+ // If all the predecessors go to a single known successor, we want to fold,
+ // not thread. By doing so, we do not need to duplicate the current block and
+ // we also do not miss potential opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (PredWithKnownDest ==
+ (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ bool SeenFirstBranchToOnlyDest = false;
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ else
+ SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ }
+
+ // Finally update the terminator.
+ TerminatorInst *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes, that have the `Cond` as the use. This
+ // is because we use the guards/assume to reason about the `Cond` value
+ // at the end of block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
+ ReplaceFoldableUses(CondInst, OnlyVal);
+ }
+ return true;
+ }
+ }
+
+ // Determine which is the most common successor. If we have many inputs and
+ // this block is a switch, we want to start by threading the batch that goes
+ // to the most popular destination first. If we only know about one
+ // threadable destination (the common case) we can avoid this.
+ BasicBlock *MostPopularDest = OnlyDest;
+
+ if (MostPopularDest == MultipleDestSentinel)
+ MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+ // Now that we know what the most popular destination is, factor all
+ // predecessors that will jump to it into a single predecessor.
+ SmallVector<BasicBlock*, 16> PredsToFactor;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second == MostPopularDest) {
+ BasicBlock *Pred = PredToDest.first;
+
+ // This predecessor may be a switch or something else that has multiple
+ // edges to the block. Factor each of these edges by listing them
+ // according to # occurrences in PredsToFactor.
+ for (BasicBlock *Succ : successors(Pred))
+ if (Succ == BB)
+ PredsToFactor.push_back(Pred);
+ }
+
+ // If the threadable edges are branching on an undefined value, we get to pick
+ // the destination that these predecessors should get to.
+ if (!MostPopularDest)
+ MostPopularDest = BB->getTerminator()->
+ getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block. See if there are any simplifications we
+/// can do based on inputs to the phi node.
+///
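+/// For illustration (hypothetical IR): if a predecessor P ends in
+///   br label %BB
+/// and BB is
+///   %c = phi i1 [ true, %P ], [ %d, %Q ]
+///   br i1 %c, label %T, label %F
+/// then duplicating BB's branch into P lets P branch directly to %T.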
+bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
+ BasicBlock *BB = PN->getParent();
+
+ // TODO: We could make use of this to do it once for blocks with common PHI
+ // values.
+ SmallVector<BasicBlock*, 1> PredBBs;
+ PredBBs.resize(1);
+
+ // If any of the predecessor blocks end in an unconditional branch, we can
+ // *duplicate* the conditional branch into that block in order to further
+ // encourage jump threading and to eliminate cases where we have branch on a
+ // phi of an icmp (branch on icmp is much better).
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+ if (PredBr->isUnconditional()) {
+ PredBBs[0] = PredBB;
+ // Try to duplicate BB into PredBB.
+ if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block. See if there are any
+/// simplifications we can do based on inputs to the xor.
+///
+bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
+ BasicBlock *BB = BO->getParent();
+
+ // If either the LHS or RHS of the xor is a constant, don't do this
+ // optimization.
+ if (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1)))
+ return false;
+
+ // If the first instruction in BB isn't a phi, we won't be able to infer
+ // anything special about any particular predecessor.
+ if (!isa<PHINode>(BB->front()))
+ return false;
+
+ // If this BB is a landing pad, we won't be able to split the edge into it.
+ if (BB->isEHPad())
+ return false;
+
+ // If we have a xor as the branch input to this block, and we know that the
+ // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+ // the condition into the predecessor and fix that value to true, saving some
+ // logical ops on that path and encouraging other paths to simplify.
+ //
+ // This copies something like this:
+ //
+ // BB:
+ // %X = phi i1 [1], [%X']
+ // %Y = icmp eq i32 %A, %B
+ // %Z = xor i1 %X, %Y
+ // br i1 %Z, ...
+ //
+ // Into:
+ // BB':
+ // %Y = icmp ne i32 %A, %B
+ // br i1 %Y, ...
+
+ PredValueInfoTy XorOpValues;
+ bool isLHS = true;
+ if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+ WantInteger, BO)) {
+ assert(XorOpValues.empty());
+ if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+ WantInteger, BO))
+ return false;
+ isLHS = false;
+ }
+
+ assert(!XorOpValues.empty() &&
+ "ComputeValueKnownInPredecessors returned true with no values");
+
+ // Scan the information to see which is most popular: true or false. The
+ // predecessors can be of the set true, false, or undef.
+ unsigned NumTrue = 0, NumFalse = 0;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (isa<UndefValue>(XorOpValue.first))
+ // Ignore undefs for the count.
+ continue;
+ if (cast<ConstantInt>(XorOpValue.first)->isZero())
+ ++NumFalse;
+ else
+ ++NumTrue;
+ }
+
+ // Determine which value to split on, true, false, or undef if neither.
+ ConstantInt *SplitVal = nullptr;
+ if (NumTrue > NumFalse)
+ SplitVal = ConstantInt::getTrue(BB->getContext());
+ else if (NumTrue != 0 || NumFalse != 0)
+ SplitVal = ConstantInt::getFalse(BB->getContext());
+
+ // Collect all of the blocks that this can be folded into so that we can
+ // factor this once and clone it once.
+ SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
+ continue;
+
+ BlocksToFoldInto.push_back(XorOpValue.second);
+ }
+
+ // If we inferred a value for all of the predecessors, then duplication won't
+ // help us. However, we can just replace the LHS or RHS with the constant.
+ if (BlocksToFoldInto.size() ==
+ cast<PHINode>(BB->front()).getNumIncomingValues()) {
+ if (!SplitVal) {
+ // If all preds provide undef, just nuke the xor, because it is undef too.
+ BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+ BO->eraseFromParent();
+ } else if (SplitVal->isZero()) {
+ // If all preds provide 0, replace the xor with the other input.
+ BO->replaceAllUsesWith(BO->getOperand(isLHS));
+ BO->eraseFromParent();
+ } else {
+ // If all preds provide 1, set the computed value to 1.
+ BO->setOperand(!isLHS, SplitVal);
+ }
+
+ return true;
+ }
+
+ // Try to duplicate BB into PredBB.
+ return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
+
+/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
+static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+ BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ DenseMap<Instruction*, Value*> &ValueMap) {
+ for (BasicBlock::iterator PNI = PHIBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) {
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN->getIncomingValueForBlock(OldPred);
+
+ // Remap the value if necessary.
+ if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+ if (I != ValueMap.end())
+ IV = I->second;
+ }
+
+ PN->addIncoming(IV, NewPred);
+ }
+}
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
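+/// For illustration (hypothetical CFG): given Pred -> BB -> {T, F} where BB's
+/// branch is known to take T whenever control arrives from Pred, the non-phi
+/// instructions of BB are cloned into a new block BB.thread, Pred is redirected
+/// to BB.thread, and BB.thread ends in an unconditional branch to T.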
+bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ // If threading to the same block we come from, we would create an infinite loop.
+ if (SuccBB == BB) {
+ DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName()
+ << "' to dest BB '" << SuccBB->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // And finally, do it!
+ DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
+ << SuccBB->getName() << "' with cost: " << JumpThreadCost
+ << ", across block:\n "
+ << *BB << "\n");
+
+ LVI->threadEdge(PredBB, BB, SuccBB);
+
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+ BB->getName()+".thread",
+ BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; !isa<TerminatorInst>(BI); ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
+ AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+ // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
+ // with the two values we know.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
+ // NewBB instead of BB. This eliminates predecessors from BB, which requires
+ // us to simplify any PHI nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
+ UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+
+ // Threaded an edge!
+ ++NumThreads;
+ return true;
+}
+
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is available, update the frequency of
+/// this new block.
+BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
+ // Collect the frequencies of all predecessors of BB, which will be used to
+ // update the edge weight on BB->SuccBB.
+ BlockFrequency PredBBFreq(0);
+ if (HasProfileData)
+ for (auto Pred : Preds)
+ PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+
+ BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+
+ // Set the block frequency of the newly created PredBB, which is the sum of
+ // frequencies of Preds.
+ if (HasProfileData)
+ BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
+ return PredBB;
+}
+
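+/// Return true if the terminator of BB carries real branch-weight profile
+/// metadata. As an illustrative sketch (the weights below are made up), a
+/// conditional branch with such metadata looks like:
+///   br i1 %cond, label %then, label %else, !prof !0
+///   !0 = !{!"branch_weights", i32 20, i32 10}
+/// with one weight per successor after the leading "branch_weights" name.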
+bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "not a split");
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
+ if (MDName->getString() != "branch_weights")
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
+}
+
+/// Update the block frequency of BB and the branch-weight metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by
+/// 1 - Freq(PredBB->BB) / Freq(BB->SuccBB).
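+///
+/// A worked example with made-up numbers: if Freq(BB) was 100, the threaded
+/// edge PredBB->BB contributed NewBBFreq = 40, and BB->SuccBB carried 70, then
+/// BB's new frequency is 100 - 40 = 60 and the surviving BB->SuccBB frequency
+/// is 70 - 40 = 30; the remaining outgoing frequencies are then renormalized
+/// into branch probabilities below.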
+void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
+ if (!HasProfileData)
+ return;
+
+ assert(BFI && BPI && "BFI & BPI should have been created here");
+
+ // As the edge from PredBB to BB is deleted, we have to update the block
+ // frequency of BB.
+ auto BBOrigFreq = BFI->getBlockFreq(BB);
+ auto NewBBFreq = BFI->getBlockFreq(NewBB);
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+ auto BBNewFreq = BBOrigFreq - NewBBFreq;
+ BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+ // Collect updated outgoing edges' frequencies from BB and use them to update
+ // edge probabilities.
+ SmallVector<uint64_t, 4> BBSuccFreq;
+ for (BasicBlock *Succ : successors(BB)) {
+ auto SuccFreq = (Succ == SuccBB)
+ ? BB2SuccBBFreq - NewBBFreq
+ : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
+ BBSuccFreq.push_back(SuccFreq.getFrequency());
+ }
+
+ uint64_t MaxBBSuccFreq =
+ *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+ SmallVector<BranchProbability, 4> BBSuccProbs;
+ if (MaxBBSuccFreq == 0)
+ BBSuccProbs.assign(BBSuccFreq.size(),
+ {1, static_cast<unsigned>(BBSuccFreq.size())});
+ else {
+ for (uint64_t Freq : BBSuccFreq)
+ BBSuccProbs.push_back(
+ BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+ // Normalize edge probabilities so that they sum up to one.
+ BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+ BBSuccProbs.end());
+ }
+
+ // Update edge probabilities in BPI.
+ for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
+ BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+
+ // Update the profile metadata as well.
+ //
+ // Don't do this if the profile of the transformed blocks was statically
+ // estimated. (This could occur despite the function having an entry
+ // frequency in completely cold parts of the CFG.)
+ //
+ // In this case we don't want to suggest to subsequent passes that the
+ // calculated weights are fully consistent. Consider this graph:
+ //
+ // check_1
+ // 50% / |
+ // eq_1 | 50%
+ // \ |
+ // check_2
+ // 50% / |
+ // eq_2 | 50%
+ // \ |
+ // check_3
+ // 50% / |
+ // eq_3 | 50%
+ // \ |
+ //
+ // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
+ // the overall probabilities are inconsistent; the total probability that the
+ // value is either 1, 2 or 3 is 150%.
+ //
+ // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
+ // becomes 0%. This is even worse if the edge whose probability becomes 0% is
+ // the loop exit edge. Then based solely on static estimation we would assume
+ // the loop was extremely hot.
+ //
+ // FIXME: Fix this locally as well so that BPI and BFI stay consistent; we
+ // shouldn't make edges extremely likely or unlikely based solely on static
+ // estimation.
+ if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
+ SmallVector<uint32_t, 4> Weights;
+ for (auto Prob : BBSuccProbs)
+ Weights.push_back(Prob.getNumerator());
+
+ auto TI = BB->getTerminator();
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+ }
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
+bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
+ assert(!PredBBs.empty() && "Can't handle an empty set");
+
+ // If BB is a loop header, then duplicating this block outside the loop would
+ // cause us to transform this into an irreducible loop, so don't do this.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // Okay, we decided to do this! Clone all the instructions in BB onto the end
+ // of PredBB.
+ DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
+ << PredBB->getName() << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
+
+ // Unless PredBB ends with an unconditional branch, split the edge so that we
+ // can just clone the bits from BB into the end of the new PredBB.
+ BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+ if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ PredBB = SplitEdge(PredBB, BB);
+ OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ }
+
+ // We are going to have to map operands from the original block BB into the
+ // PredBB block. Evaluate PHI nodes in BB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+ // Clone the non-phi instructions of BB into PredBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; BI != BB->end(); ++BI) {
+ Instruction *New = BI->clone();
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+
+ // If this instruction can be simplified after the operands are updated,
+ // just use the simplified value instead. This frequently happens due to
+ // phi translation.
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
+ ValueMapping[&*BI] = IV;
+ if (!New->mayHaveSideEffects()) {
+ New->deleteValue();
+ New = nullptr;
+ }
+ } else {
+ ValueMapping[&*BI] = New;
+ }
+ if (New) {
+ // Otherwise, insert the new instruction into the block.
+ New->setName(BI->getName());
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ }
+ }
+
+ // Check to see if the targets of the branch had PHI nodes. If so, we need to
+ // add entries to the PHI nodes for branch from PredBB now.
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+ ValueMapping);
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+ ValueMapping);
+
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+ // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
+ // with the two values we know.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+ // that we nuked.
+ BB->removePredecessor(PredBB, true);
+
+ // Remove the unconditional branch at the end of the PredBB block.
+ OldPredBranch->eraseFromParent();
+
+ ++NumDupes;
+ return true;
+}
+
+/// TryToUnfoldSelect - Look for blocks of the form
+/// bb1:
+/// %a = select
+/// br bb2
+///
+/// bb2:
+/// %p = phi [%a, %bb1] ...
+/// %c = icmp %p
+/// br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
+bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+ Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+ if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+ CondLHS->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+ SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
+
+ // Look if one of the incoming values is a select in the corresponding
+ // predecessor.
+ if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
+ // Now check if one of the select values would allow us to constant fold the
+ // terminator in BB. We don't do the transform if both sides fold; those
+ // cases will be threaded in any case.
+ LazyValueInfo::Tristate LHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+ CondRHS, Pred, BB, CondCmp);
+ LazyValueInfo::Tristate RHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+ CondRHS, Pred, BB, CondCmp);
+ if ((LHSFolds != LazyValueInfo::Unknown ||
+ RHSFolds != LazyValueInfo::Unknown) &&
+ LHSFolds != RHSFolds) {
+ // Expand the select.
+ //
+ // Pred --
+ // | v
+ // | NewBB
+ // | |
+ // |-----
+ // v
+ // BB
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+ BB->getParent(), BB);
+ // Move the unconditional branch to NewBB.
+ PredTerm->removeFromParent();
+ NewBB->getInstList().insert(NewBB->end(), PredTerm);
+ // Create a conditional branch and update PHI nodes.
+ BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+ CondLHS->setIncomingValue(I, SI->getFalseValue());
+ CondLHS->addIncoming(SI->getTrueValue(), NewBB);
+ // The select is now dead.
+ SI->eraseFromParent();
+
+ // Update any other PHI nodes in BB.
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+ if (Phi != CondLHS)
+ Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// bb:
+/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+/// %s = select p, trueval, falseval
+///
+/// And expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Using an approach similar to SimplifyCFG::FoldCondBranchOnPHI(), unfold the
+/// select if the associated PHI has at least one constant incoming value. If
+/// the unfolded select is not jump-threaded, it will be folded again by later
+/// optimizations.
+bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ // Look for a Phi/Select pair in the same basic block. The Phi feeds the
+ // condition of the Select and at least one of the incoming values is a
+ // constant.
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (NumPHIValues == 0 || !PN->hasOneUse())
+ continue;
+
+ SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
+ if (!SI || SI->getParent() != BB)
+ continue;
+
+ Value *Cond = SI->getCondition();
+ if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
+ continue;
+
+ bool HasConst = false;
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ if (PN->getIncomingBlock(i) == BB)
+ return false;
+ if (isa<ConstantInt>(PN->getIncomingValue(i)))
+ HasConst = true;
+ }
+
+ if (HasConst) {
+ // Expand the select.
+ TerminatorInst *Term =
+ SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
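+///
+/// A rough sketch of the result, continuing the example above ProcessGuards
+/// and assuming %cond implies %condGuard (the block and value names below are
+/// illustrative only, not taken from the code): the instructions above the
+/// guard are duplicated into both incoming edges, the guard is kept only on
+/// the branch where the implication is not proved, and phi nodes in Merge
+/// join the two copies of any value that is still used below the guard:
+///
+/// T1:
+///   br label %Merge.split.us          ; guard proved true on this path
+/// Merge.split.us:
+///   %condGuard.us = ...
+///   br label %Merge
+/// F1:
+///   br label %Merge.split
+/// Merge.split:
+///   %condGuard.g = ...
+///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard.g )[ "deopt"() ]
+///   br label %Merge
+/// Merge:
+///   %condGuard = phi i1 [ %condGuard.us, %Merge.split.us ],
+///                       [ %condGuard.g, %Merge.split ]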
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl =
+ isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 000000000000..37b9c4b1094e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,1397 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumSunk, "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+
+/// Memory promotion is enabled by default.
+static cl::opt<bool>
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable memory promotion in LICM pass"));
+
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo);
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE);
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI = nullptr);
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo);
+
+namespace {
+struct LoopInvariantCodeMotion {
+ bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE, bool DeleteAST);
+
+ DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
+ return LoopToAliasSetMap;
+ }
+
+private:
+ DenseMap<Loop *, AliasSetTracker *> LoopToAliasSetMap;
+
+ AliasSetTracker *collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA);
+};
+
+struct LegacyLICMPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LegacyLICMPass() : LoopPass(ID) {
+ initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L)) {
+ // If we have run LICM on a previous loop but now we are skipping
+ // (because we've hit the opt-bisect limit), we need to clear the
+ // loop alias information.
+ for (auto &LTAS : LICM.getLoopToAliasSetMap())
+ delete LTAS.second;
+ LICM.getLoopToAliasSetMap().clear();
+ return false;
+ }
+
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+ return LICM.runOnLoop(L,
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ SE ? &SE->getSE() : nullptr, &ORE, false);
+ }
+
+ /// This transformation requires natural loop information and requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ using llvm::Pass::doFinalization;
+
+ bool doFinalization() override {
+ assert(LICM.getLoopToAliasSetMap().empty() &&
+ "Didn't free loop alias sets");
+ return false;
+ }
+
+private:
+ LoopInvariantCodeMotion LICM;
+
+ /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) override;
+
+ /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+ /// set.
+ void deleteAnalysisValue(Value *V, Loop *L) override;
+
+ /// Simple Analysis hook. Delete loop L from alias set map.
+ void deleteAnalysisLoop(Loop *L) override;
+};
+}
+
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
+
+ LoopInvariantCodeMotion LICM;
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+char LegacyLICMPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
+ false)
+
+Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
+
+/// Hoist expressions out of the specified loop. Note that alias info for the
+/// inner loop is not preserved, so it is not a good idea to run LICM multiple
+/// times on one loop.
+/// We should delete the AST for inner loops in the new pass manager to avoid
+/// a memory leak.
+///
+bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
+ LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI,
+ ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE,
+ bool DeleteAST) {
+ bool Changed = false;
+
+ assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+ AliasSetTracker *CurAST = collectAliasInfoForLoop(L, LI, AA);
+
+ // Get the preheader block to move instructions into...
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Compute loop safety information.
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, L);
+
+ // We want to visit all of the instructions in this loop... that are not part
+ // of our subloops (they have already had their invariants hoisted out of
+ // their loop, into this loop, so there is no need to process the BODIES of
+ // the subloops).
+ //
+ // Traverse the body of the loop in depth first order on the dominator tree so
+ // that we are guaranteed to see definitions before we see uses. This allows
+ // us to sink instructions in one pass, without iteration. After sinking
+ // instructions, we perform another pass to hoist them out of the loop.
+ //
+ if (L->hasDedicatedExits())
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ CurAST, &SafetyInfo, ORE);
+ if (Preheader)
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ CurAST, &SafetyInfo, ORE);
+
+ // Now that all loop invariants have been removed from the loop, promote any
+ // memory references to scalars that we can.
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify, so
+ // make sure we catch that. An additional load may be generated in the
+ // preheader for the SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+ // Figure out the loop exits and their insertion points
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // We can't insert into a catchswitch.
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ });
+
+ if (!HasCatchSwitch) {
+ SmallVector<Instruction *, 8> InsertPts;
+ InsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+
+ PredIteratorCache PIC;
+
+ bool Promoted = false;
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSet &AS : *CurAST)
+ Promoted |=
+ promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT,
+ TLI, L, CurAST, &SafetyInfo, ORE);
+
+ // Once we have promoted values across the loop body we have to
+ // recursively reform LCSSA as any nested loop may now have values defined
+ // within the loop used in the outer loop.
+ // FIXME: This is really heavy handed. It would be a bit better to use an
+ // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+ // it as it went.
+ if (Promoted)
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= Promoted;
+ }
+ }
+
+ // Check that neither this loop nor its parent have had LCSSA broken. LICM is
+ // specifically moving instructions across the loop boundary and so it is
+ // especially in need of sanity checking here.
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+ assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+ "Parent loop not left in LCSSA form after LICM!");
+
+ // If this loop is nested inside of another one, save the alias information
+ // for when we process the outer loop.
+ if (L->getParentLoop() && !DeleteAST)
+ LoopToAliasSetMap[L] = CurAST;
+ else
+ delete CurAST;
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(L);
+ return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth
+/// first order w.r.t the DominatorTree. This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to sinkRegion");
+
+ BasicBlock *BB = N->getBlock();
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB))
+ return false;
+
+ // We are processing blocks in reverse dfo, so process children first.
+ bool Changed = false;
+ const std::vector<DomTreeNode *> &Children = N->getChildren();
+ for (DomTreeNode *Child : Children)
+ Changed |=
+ sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
+
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ return Changed;
+
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
+ Instruction &I = *--II;
+
+ // If the instruction is dead, we would otherwise try to sink it because it
+ // isn't used in the loop; instead, just delete it.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ // Check to see if we can sink this instruction to the exit blocks
+ // of the loop. We can do this if all users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
+ if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
+ ++II;
+ Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+ }
+ }
+ return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to hoistRegion");
+
+ BasicBlock *BB = N->getBlock();
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB))
+ return false;
+
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ bool Changed = false;
+ if (!inSubLoop(BB, CurLoop, LI))
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to just
+ // fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
+ CurAST->copyValue(&I, C);
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
+ Changed = true;
+ continue;
+ }
+
+ // Attempt to move floating-point division out of the loop by converting it
+ // to a multiplication by the reciprocal of the loop-invariant divisor.
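+ // For example (valid only because the instruction carries the
+ // allow-reciprocal fast-math flag): x / d becomes x * (1.0 / d), and the
+ // loop-invariant 1.0 / d is then hoisted out of the loop.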
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+ ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do this
+ // if all of the operands of the instruction are loop invariant and if it
+ // is safe to hoist the instruction.
+ //
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator()))
+ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
+ }
+
+ const std::vector<DomTreeNode *> &Children = N->getChildren();
+ for (DomTreeNode *Child : Children)
+ Changed |=
+ hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
+ return Changed;
+}
+
+/// Computes loop safety information, checks loop body & header
+/// for the possibility of may throw exception.
+///
+void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
+ assert(CurLoop != nullptr && "CurLoop can't be null");
+ BasicBlock *Header = CurLoop->getHeader();
+ // Setting default safety values.
+ SafetyInfo->MayThrow = false;
+ SafetyInfo->HeaderMayThrow = false;
+ // Iterate over header and compute safety info.
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+ (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+ SafetyInfo->HeaderMayThrow |=
+ !isGuaranteedToTransferExecutionToSuccessor(&*I);
+
+ SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+ // Iterate over loop instructions and compute safety info.
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
+ BBE = CurLoop->block_end();
+ (BB != BBE) && !SafetyInfo->MayThrow; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !SafetyInfo->MayThrow; ++I)
+ SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I);
+
+ // Compute funclet colors if we might sink/hoist in a function with a funclet
+ // personality routine.
+ Function *Fn = CurLoop->getHeader()->getParent();
+ if (Fn->hasPersonalityFn())
+ if (Constant *PersonalityFn = Fn->getPersonalityFn())
+ if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+ SafetyInfo->BlockColors = colorEHFunclets(*Fn);
+}
+
+// Return true if LI is invariant within the scope of the loop. LI is
+// invariant if CurLoop is dominated by an invariant.start representing the
+// same memory location and size as the memory location LI loads from, and
+// the invariant.start has no uses.
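+//
+// An illustrative IR shape (names and sizes below are made up):
+//   %i = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %p)  ; dominates the
+//                                                            ; loop, no uses
+//   loop:
+//     %v = load i32, i32* %p   ; may be treated as invariant and hoisted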
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+ // If the type is i8 addrspace(x)*, we know this is the type of the
+ // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing a long chain of bitcasts.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+ // Avoid traversing a load operand with a high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // If there are escaping uses of the invariant.start instruction, the load
+ // may be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
+bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // Loads have extra constraints we have to verify before we can hoist them.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isUnordered())
+ return false; // Don't hoist volatile/atomic loads!
+
+ // Loads from constant memory are always safe to move, even if they end up
+ // in the same alias set as something that ends up being modified.
+ if (AA->pointsToConstantMemory(LI->getOperand(0)))
+ return true;
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
+ // Don't hoist loads which have may-aliased stores in loop.
+ uint64_t Size = 0;
+ if (LI->getType()->isSized())
+ Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType());
+
+ AAMDNodes AAInfo;
+ LI->getAAMetadata(AAInfo);
+
+ bool Invalidated =
+ pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+ // Check loop-invariant address because this may also be a sinkable load
+ // whose address is not necessarily loop-invariant.
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit(OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value");
+
+ return !Invalidated;
+ } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Don't sink or hoist dbg info; it's legal, but not useful.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Don't sink calls which can throw.
+ if (CI->mayThrow())
+ return false;
+
+ // Handle simple cases by querying alias analysis.
+ FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+ if (Behavior == FMRB_DoesNotAccessMemory)
+ return true;
+ if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+ // A readonly argmemonly function only reads from memory pointed to by
+ // its arguments with arbitrary offsets. If we can prove there are no
+ // writes to this memory in the loop, we can hoist or sink.
+ if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) {
+ for (Value *Op : CI->arg_operands())
+ if (Op->getType()->isPointerTy() &&
+ pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize,
+ AAMDNodes(), CurAST))
+ return false;
+ return true;
+ }
+ // If this call only reads from memory and there are no writes to memory
+ // in the loop, we can hoist or sink the call as appropriate.
+ bool FoundMod = false;
+ for (AliasSet &AS : *CurAST) {
+ if (!AS.isForwardingAliasSet() && AS.isMod()) {
+ FoundMod = true;
+ break;
+ }
+ }
+ if (!FoundMod)
+ return true;
+ }
+
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
+
+ return false;
+ }
+
+ // Only these instructions are hoistable/sinkable.
+ if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) &&
+ !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) &&
+ !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+ !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) &&
+ !isa<InsertValueInst>(I))
+ return false;
+
+ // SafetyInfo is nullptr if we are checking for sinking from preheader to
+ // loop body. It will be always safe as there is no speculative execution.
+ if (!SafetyInfo)
+ return true;
+
+ // TODO: Plumb the context instruction through to make hoisting and sinking
+ // more powerful. Hoisting of loads already works due to the special casing
+ // above.
+ return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo, nullptr);
+}
+
+/// Returns true if a PHINode is trivially replaceable with an Instruction.
+/// This is true when all of its incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
+///
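+/// For example (illustrative), the LCSSA phi
+///   %x.lcssa = phi i32 [ %x, %loop.exiting ]
+/// has %x as its only incoming value and is trivially replaceable by %x.
+///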
+static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+ for (const Value *IncValue : PN.incoming_values())
+ if (IncValue != &I)
+ return false;
+
+ return true;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
+ for (const User *U : I.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ const BasicBlock *BB = PN->getParent();
+ // We cannot sink uses in catchswitches.
+ if (isa<CatchSwitchInst>(BB->getTerminator()))
+ return false;
+
+ // We need to sink a callsite to a unique funclet. Avoid sinking if the
+ // phi use is too muddled.
+ if (isa<CallInst>(I))
+ if (!BlockColors.empty() &&
+ BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+ return false;
+
+ // A PHI node where all of the incoming values are this instruction is
+ // special -- it can just be RAUW'ed with the instruction and thus
+ // doesn't require a use in the predecessor. This is a particularly important
+ // special case because it is the pattern found in LCSSA form.
+ if (isTriviallyReplacablePHI(*PN, I)) {
+ if (CurLoop->contains(PN))
+ return false;
+ else
+ continue;
+ }
+
+ // Otherwise, a PHI node use of this instruction effectively occurs in the
+ // predecessor block of the incoming value. Check for such a use being
+ // inside the loop.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I)
+ if (CurLoop->contains(PN->getIncomingBlock(i)))
+ return false;
+
+ continue;
+ }
+
+ if (CurLoop->contains(UI))
+ return false;
+ }
+ return true;
+}
+
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo) {
+ Instruction *New;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
+
+ // Sinking call-sites need to be handled differently from other
+ // instructions. The cloned call-site needs a funclet bundle operand
+ // appropriate for its location in the CFG.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+ BundleIdx != BundleEnd; ++BundleIdx) {
+ OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+ assert(CV.size() == 1 && "non-unique color for exit block!");
+ BasicBlock *BBColor = CV.front();
+ Instruction *EHPad = BBColor->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ New = CallInst::Create(CI, OpBundles);
+ } else {
+ New = I.clone();
+ }
+
+ ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+ if (!I.getName().empty())
+ New->setName(I.getName() + ".le");
+
+ // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+ // particularly cheap because we can rip off the PHI node that we're
+ // replacing for the number and blocks of the predecessors.
+ // OPT: If this shows up in a profile, we can instead finish sinking all
+ // invariant instructions, and then walk their operands to re-establish
+ // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+ // sinking bottom-up.
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+ ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+ if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+ if (!OLoop->contains(&PN)) {
+ PHINode *OpPN =
+ PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+ OInst->getName() + ".lcssa", &ExitBlock.front());
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+ *OI = OpPN;
+ }
+ return New;
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I));
+ bool Changed = false;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumSunk;
+ Changed = true;
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
+ while (!I.use_empty()) {
+ Value::user_iterator UI = I.user_begin();
+ auto *User = cast<Instruction>(*UI);
+ if (!DT->isReachableFromEntry(User->getParent())) {
+ User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+ continue;
+ }
+ // The user must be a PHI node.
+ PHINode *PN = cast<PHINode>(User);
+
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ Use &U = UI.getUse();
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ continue;
+ }
+
+ BasicBlock *ExitBlock = PN->getParent();
+ assert(ExitBlockSet.count(ExitBlock) &&
+ "The LCSSA PHI is not in an exit block!");
+
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+
+ PN->replaceAllUsesWith(New);
+ PN->eraseFromParent();
+ }
+
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ return Changed;
+}
+
+/// When an instruction that uses only loop-invariant operands is found to be
+/// safe to hoist, this function is called to do the dirty work.
+///
+static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ auto *Preheader = CurLoop->getLoopPreheader();
+ DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
+ << "\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
+ << "hoisting " << ore::NV("Inst", &I));
+
+ // Metadata can be dependent on conditions we are hoisting above.
+ // Conservatively strip all metadata on the instruction unless we were
+ // guaranteed to execute I if we entered the loop, in which case the metadata
+ // is valid in the loop preheader.
+ if (I.hasMetadataOtherThanDebugLoc() &&
+ // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
+ // time in isGuaranteedToExecute if we don't actually have anything to
+ // drop. It is a compile time optimization, not required for correctness.
+ !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo))
+ I.dropUnknownNonDebugMetadata();
+
+ // Move the new node to the Preheader, before its terminator.
+ I.moveBefore(Preheader->getTerminator());
+
+ // Do not retain debug locations when we are moving instructions to different
+ // basic blocks, because we want to avoid jumpy line tables. Calls, however,
+ // need to retain their debug locs because they may be inlined.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+ if (!isa<CallInst>(I))
+ I.setDebugLoc(DebugLoc());
+
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumHoisted;
+ return true;
+}
+
+/// Only sink or hoist an instruction if it is not a trapping instruction,
+/// if it is known not to trap when moved to the preheader,
+/// or if it is a trapping instruction that is guaranteed to execute.
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI) {
+ if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
+ return true;
+
+ bool GuaranteedToExecute =
+ isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+
+ if (!GuaranteedToExecute) {
+ auto *LI = dyn_cast<LoadInst>(&Inst);
+ if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit(OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed");
+ }
+
+ return GuaranteedToExecute;
+}
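To illustrate the distinction this helper draws, consider a loop-invariant but possibly-trapping instruction that is only conditionally executed (hypothetical C++; the guarded division stands in for any instruction that may trap):

int sum_with_ratio(const int *x, int n, int k, int d) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    if (d != 0)
      s += x[i] + k / d;   // 'k / d' is loop-invariant, but it runs only when
                           // d != 0. It is not safe to speculate (it may trap
                           // when d == 0) and not guaranteed to execute, so it
                           // cannot be hoisted to the preheader.
  }
  return s;
}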
+
+namespace {
+class LoopPromoter : public LoadAndStorePromoter {
+ Value *SomePtr; // Designated pointer to store to.
+ SmallPtrSetImpl<Value *> &PointerMustAliases;
+ SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
+ SmallVectorImpl<Instruction *> &LoopInsertPts;
+ PredIteratorCache &PredCache;
+ AliasSetTracker &AST;
+ LoopInfo &LI;
+ DebugLoc DL;
+ int Alignment;
+ bool UnorderedAtomic;
+ AAMDNodes AATags;
+
+ Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Loop *L = LI.getLoopFor(I->getParent()))
+ if (!L->contains(BB)) {
+ // We need to create an LCSSA PHI node for the incoming value and
+ // store that.
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
+ I->getName() + ".lcssa", &BB->front());
+ for (BasicBlock *Pred : PredCache.get(BB))
+ PN->addIncoming(I, Pred);
+ return PN;
+ }
+ return V;
+ }
+
+public:
+ LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+ SmallPtrSetImpl<Value *> &PMA,
+ SmallVectorImpl<BasicBlock *> &LEB,
+ SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+ AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
+ bool UnorderedAtomic, const AAMDNodes &AATags)
+ : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+ LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+ UnorderedAtomic(UnorderedAtomic),AATags(AATags) {}
+
+ bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &) const override {
+ Value *Ptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+ return PointerMustAliases.count(Ptr);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() const override {
+    // Insert stores into the loop exit blocks. Each exit block gets a
+    // store of the live-out value that feeds it. Since we've already told
+ // the SSA updater about the defs in the loop and the preheader
+ // definition, it is all set and we can start using it.
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = LoopExitBlocks[i];
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+ Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+ Instruction *InsertPos = LoopInsertPts[i];
+ StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
+ NewSI->setAlignment(Alignment);
+ NewSI->setDebugLoc(DL);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
+ }
+ }
+
+ void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+ // Update alias analysis.
+ AST.copyValue(LI, V);
+ }
+ void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
+};
+} // end anon namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
+///
+bool llvm::promoteLoopAccessesToScalars(
+ AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
+ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
+ CurAST != nullptr && SafetyInfo != nullptr &&
+ "Unexpected Input to promoteLoopAccessesToScalars");
+
+ // We can promote this alias set if it has a store, if it is a "Must" alias
+ // set, if the pointer is loop invariant, and if we are not eliminating any
+ // volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+ return false;
+
+ assert(!AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+
+ Value *SomePtr = AS.begin()->getValue();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+ // It isn't safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // The safety property divides into two parts:
+ // p1) The memory may not be dereferenceable on entry to the loop. In this
+ // case, we can't insert the required load in the preheader.
+ // p2) The memory model does not allow us to insert a store along any dynamic
+ // path which did not originally have one.
+ //
+ // If at least one store is guaranteed to execute, both properties are
+ // satisfied, and promotion is legal.
+ //
+ // This, however, is not a necessary condition. Even if no store/load is
+ // guaranteed to execute, we can still establish these properties.
+ // We can establish (p1) by proving that hoisting the load into the preheader
+ // is safe (i.e. proving dereferenceability on all paths through the loop). We
+ // can use any access within the alias set to prove dereferenceability,
+ // since they're all must alias.
+ //
+  // There are two ways to establish (p2):
+ // a) Prove the location is thread-local. In this case the memory model
+ // requirement does not apply, and stores are safe to insert.
+ // b) Prove a store dominates every exit block. In this case, if an exit
+  // block is reached, the original dynamic path would have taken us through
+ // the store, so inserting a store into the exit block is safe. Note that this
+ // is different from the store being guaranteed to execute. For instance,
+ // if an exception is thrown on the first iteration of the loop, the original
+ // store is never executed, but the exit blocks are not executed either.
+
+ bool DereferenceableInPH = false;
+ bool SafeToInsertStore = false;
+
+ SmallVector<Instruction *, 64> LoopUses;
+ SmallPtrSet<Value *, 4> PointerMustAliases;
+
+ // We start with an alignment of one and try to find instructions that allow
+ // us to prove better alignment.
+ unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
+ AAMDNodes AATags;
+
+ const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
+  // Do we know this object does not escape?
+ bool IsKnownNonEscapingObject = false;
+ if (SafetyInfo->MayThrow) {
+ // If a loop can throw, we have to insert a store along each unwind edge.
+ // That said, we can't actually make the unwind edge explicit. Therefore,
+ // we have to prove that the store is dead along the unwind edge.
+ //
+ // If the underlying object is not an alloca, nor a pointer that does not
+ // escape, then we can not effectively prove that the store is dead along
+ // the unwind edge. i.e. the caller of this function could have ways to
+ // access the pointed object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ // If this is a base pointer we do not understand, simply bail.
+ // We only handle alloca and return value from alloc-like fn right now.
+ if (!isa<AllocaInst>(Object)) {
+ if (!isAllocLikeFn(Object, TLI))
+ return false;
+      // If this is an alloc-like fn, there are more constraints we need to verify.
+      // More specifically, we must make sure that the pointer cannot escape.
+      //
+      // NOTE: PointerMayBeCaptured is not enough, as the pointer may have escaped
+      // even though it's not captured by the enclosing function. Standard allocation
+ // functions like malloc, calloc, and operator new return values which can
+ // be assumed not to have previously escaped.
+ if (PointerMayBeCaptured(Object, true, true))
+ return false;
+ IsKnownNonEscapingObject = true;
+ }
+ }
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes. While we are at it, collect alignment and AA info.
+ for (const auto &ASI : AS) {
+ Value *ASIV = ASI.getValue();
+ PointerMustAliases.insert(ASIV);
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes.
+ if (SomePtr->getType() != ASIV->getType())
+ return false;
+
+ for (User *U : ASIV->users()) {
+ // Ignore instructions that are outside the loop.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !CurLoop->contains(UI))
+ continue;
+
+      // If there is a non-load/store instruction in the loop, we can't promote
+ // it.
+ if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ assert(!Load->isVolatile() && "AST broken");
+ if (!Load->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
+
+ if (!DereferenceableInPH)
+ DereferenceableInPH = isSafeToExecuteUnconditionally(
+ *Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator());
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+ // Stores *of* the pointer are not interesting, only stores *to* the
+ // pointer.
+ if (UI->getOperand(1) != ASIV)
+ continue;
+ assert(!Store->isVolatile() && "AST broken");
+ if (!Store->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
+ // If the store is guaranteed to execute, both properties are satisfied.
+ // We may want to check if a store is guaranteed to execute even if we
+ // already know that promotion is safe, since it may have higher
+ // alignment than any other guaranteed stores, in which case we can
+ // raise the alignment on the promoted store.
+ unsigned InstAlignment = Store->getAlignment();
+ if (!InstAlignment)
+ InstAlignment =
+ MDL.getABITypeAlignment(Store->getValueOperand()->getType());
+
+ if (!DereferenceableInPH || !SafeToInsertStore ||
+ (InstAlignment > Alignment)) {
+ if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+ DereferenceableInPH = true;
+ SafeToInsertStore = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ }
+
+ // If a store dominates all exit blocks, it is safe to sink.
+ // As explained above, if an exit block was executed, a dominating
+        // store must have been executed at least once, so we are not
+ // introducing stores on paths that did not have them.
+ // Note that this only looks at explicit exit blocks. If we ever
+ // start sinking stores into unwind edges (see above), this will break.
+ if (!SafeToInsertStore)
+ SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+ return DT->dominates(Store->getParent(), Exit);
+ });
+
+ // If the store is not guaranteed to execute, we may still get
+ // deref info through it.
+ if (!DereferenceableInPH) {
+ DereferenceableInPH = isDereferenceableAndAlignedPointer(
+ Store->getPointerOperand(), Store->getAlignment(), MDL,
+ Preheader->getTerminator(), DT);
+ }
+ } else
+ return false; // Not a load or store.
+
+ // Merge the AA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
+ }
+
+ LoopUses.push_back(UI);
+ }
+ }
+
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+  // would violate the memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
+
+ // If we couldn't prove we can hoist the load, bail.
+ if (!DereferenceableInPH)
+ return false;
+
+ // We know we can hoist the load, but don't have a guaranteed store.
+ // Check whether the location is thread-local. If it is, then we can insert
+ // stores along paths which originally didn't have them without violating the
+ // memory model.
+ if (!SafeToInsertStore) {
+ // If this is a known non-escaping object, it is safe to insert the stores.
+ if (IsKnownNonEscapingObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
+ }
+ }
+
+ // If we've still failed to prove we can sink the store, give up.
+ if (!SafeToInsertStore)
+ return false;
+
+ // Otherwise, this is safe to promote, lets do it!
+ DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
+ ORE->emit(
+ OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0])
+ << "Moving accesses to memory location out of the loop");
+ ++NumPromoted;
+
+ // Grab a debug location for the inserted loads/stores; given that the
+ // inserted loads/stores have little relation to the original loads/stores,
+ // this code just arbitrarily picks a location from one, since any debug
+ // location is better than none.
+ DebugLoc DL = LoopUses[0]->getDebugLoc();
+
+ // We use the SSAUpdater interface to insert phi nodes as required.
+ SmallVector<PHINode *, 16> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
+
+ // Set up the preheader to have a definition of the value. It is the live-out
+ // value from the preheader that uses in the loop will use.
+ LoadInst *PreheaderLoad = new LoadInst(
+ SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DL);
+ if (AATags)
+ PreheaderLoad->setAAMetadata(AATags);
+ SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+ // Rewrite all the loads in the loop and remember all the definitions from
+ // stores in the loop.
+ Promoter.run(LoopUses);
+
+ // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+ if (PreheaderLoad->use_empty())
+ PreheaderLoad->eraseFromParent();
+
+ return true;
+}
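The rewrite mirrors the "*P += 1" example in the comment near the top of this function. A hedged C++ rendering of the before/after shape (illustrative names only; the pass itself rewrites IR loads and stores through the SSAUpdater, and only after the safety checks above have passed):

void accumulate_before(int *p, const int *x, int n) {
  for (int i = 0; i < n; ++i)
    *p += x[i];            // a load and a store of *p on every iteration
}

void accumulate_after(int *p, const int *x, int n) {
  int tmp = *p;            // promoted load, inserted in the preheader
  for (int i = 0; i < n; ++i)
    tmp += x[i];           // register-only updates inside the loop
  *p = tmp;                // single store, sunk into the loop exit
}
// Note: for a possibly zero-trip loop the 'after' form performs a load and a
// store that the 'before' form might never execute; proving that this is still
// safe is exactly what DereferenceableInPH and SafeToInsertStore establish.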
+
+/// Returns an owning pointer to an alias set which incorporates aliasing info
+/// from L and all subloops of L.
+/// FIXME: In the new pass manager, there is no helper function to handle loop
+/// analysis such as cloneBasicBlockAnalysis, so the AST needs to be recomputed
+/// from scratch for every loop. Hook up with the helper functions when
+/// available in the new pass manager to avoid redundant computation.
+AliasSetTracker *
+LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AliasAnalysis *AA) {
+ AliasSetTracker *CurAST = nullptr;
+ SmallVector<Loop *, 4> RecomputeLoops;
+ for (Loop *InnerL : L->getSubLoops()) {
+ auto MapI = LoopToAliasSetMap.find(InnerL);
+ // If the AST for this inner loop is missing it may have been merged into
+ // some other loop's AST and then that loop unrolled, and so we need to
+ // recompute it.
+ if (MapI == LoopToAliasSetMap.end()) {
+ RecomputeLoops.push_back(InnerL);
+ continue;
+ }
+ AliasSetTracker *InnerAST = MapI->second;
+
+ if (CurAST != nullptr) {
+      // What if InnerLoop was modified by other passes?
+ CurAST->add(*InnerAST);
+
+ // Once we've incorporated the inner loop's AST into ours, we don't need
+ // the subloop's anymore.
+ delete InnerAST;
+ } else {
+ CurAST = InnerAST;
+ }
+ LoopToAliasSetMap.erase(MapI);
+ }
+ if (CurAST == nullptr)
+ CurAST = new AliasSetTracker(*AA);
+
+ auto mergeLoop = [&](Loop *L) {
+ // Loop over the body of this loop, looking for calls, invokes, and stores.
+ for (BasicBlock *BB : L->blocks())
+ CurAST->add(*BB); // Incorporate the specified basic block
+ };
+
+ // Add everything from the sub loops that are no longer directly available.
+ for (Loop *InnerL : RecomputeLoops)
+ mergeLoop(InnerL);
+
+ // And merge in this loop.
+ mergeLoop(L);
+
+ return CurAST;
+}
+
+/// Simple analysis hook. Clone alias set info.
+///
+void LegacyLICMPass::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ AST->copyValue(From, To);
+}
+
+/// Simple Analysis hook. Delete value V from alias set
+///
+void LegacyLICMPass::deleteAnalysisValue(Value *V, Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ AST->deleteValue(V);
+}
+
+/// Simple Analysis hook. Delete value L from alias set map.
+///
+void LegacyLICMPass::deleteAnalysisLoop(Loop *L) {
+ AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L);
+ if (!AST)
+ return;
+
+ delete AST;
+ LICM.getLoopToAliasSetMap().erase(L);
+}
+
+/// Return true if the body of this loop may store into the memory
+/// location pointed to by V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST) {
+ // Check to see if any of the basic blocks in CurLoop invalidate *V.
+ return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
+}
+
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ return LI->getLoopFor(BB) != CurLoop;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
new file mode 100644
index 000000000000..a64c99117d64
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -0,0 +1,25 @@
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
new file mode 100644
index 000000000000..d09af32a99fd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -0,0 +1,341 @@
+//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Loop Data Prefetching Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+
+#define DEBUG_TYPE "loop-data-prefetch"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<bool>
+PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
+ cl::desc("Prefetch write addresses"));
+
+static cl::opt<unsigned>
+ PrefetchDistance("prefetch-distance",
+ cl::desc("Number of instructions to prefetch ahead"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ MinPrefetchStride("min-prefetch-stride",
+ cl::desc("Min stride to add prefetches"), cl::Hidden);
+
+static cl::opt<unsigned> MaxPrefetchIterationsAhead(
+ "max-prefetch-iters-ahead",
+ cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
+
+STATISTIC(NumPrefetches, "Number of prefetches inserted");
+
+namespace {
+
+/// Loop prefetch implementation class.
+class LoopDataPrefetch {
+public:
+ LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE)
+ : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop *L);
+
+  /// \brief Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
+ unsigned getMinPrefetchStride() {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride();
+ }
+
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
+
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
+
+ AssumptionCache *AC;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// Legacy class for inserting loop data prefetches.
+class LoopDataPrefetchLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
+ initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ // FIXME: For some reason, preserving SE here breaks LSR (even if
+ // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LoopDataPrefetchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+
+FunctionPass *llvm::createLoopDataPrefetchPass() {
+ return new LoopDataPrefetchLegacyPass();
+}
+
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+ unsigned TargetMinStride = getMinPrefetchStride();
+  // If the target minimum stride is 1 or less, any stride goes; no need to check.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If MinStride is set, don't prefetch unless we can ensure that stride is
+ // larger.
+ if (!ConstStride)
+ return false;
+
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
+PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ OptimizationRemarkEmitter *ORE =
+ &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ bool Changed = LDP.run();
+
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ return LDP.run();
+}
+
+bool LoopDataPrefetch::run() {
+ // If PrefetchDistance is not set, don't run the pass. This gives an
+ // opportunity for targets to run this pass for selected subtargets only
+ // (whose TTI sets PrefetchDistance).
+ if (getPrefetchDistance() == 0)
+ return false;
+ assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
+
+ bool MadeChange = false;
+
+ for (Loop *I : *LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+ MadeChange |= runOnLoop(*L);
+
+ return MadeChange;
+}
+
+bool LoopDataPrefetch::runOnLoop(Loop *L) {
+ bool MadeChange = false;
+
+ // Only prefetch in the inner-most loop
+ if (!L->empty())
+ return MadeChange;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // Calculate the number of iterations ahead to prefetch
+ CodeMetrics Metrics;
+ for (const auto BB : L->blocks()) {
+ // If the loop already has prefetches, then assume that the user knows
+ // what they are doing and don't add any more.
+ for (auto &I : *BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::prefetch)
+ return MadeChange;
+
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ }
+ unsigned LoopSize = Metrics.NumInsts;
+ if (!LoopSize)
+ LoopSize = 1;
+
+ unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+ if (!ItersAhead)
+ ItersAhead = 1;
+
+ if (ItersAhead > getMaxPrefetchIterationsAhead())
+ return MadeChange;
+
+ DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+
+ SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *PtrValue;
+ Instruction *MemI;
+
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
+ MemI = LMemI;
+ PtrValue = LMemI->getPointerOperand();
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
+ if (!PrefetchWrites) continue;
+ MemI = SMemI;
+ PtrValue = SMemI->getPointerOperand();
+ } else continue;
+
+ unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+ if (PtrAddrSpace)
+ continue;
+
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE->getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec)
+ continue;
+
+      // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(LSCEVAddRec))
+ continue;
+
+ // We don't want to double prefetch individual cache lines. If this load
+ // is known to be within one cache line of some other load that has
+ // already been prefetched, then don't prefetch this one as well.
+ bool DupPref = false;
+ for (const auto &PrefLoad : PrefLoads) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ if (const SCEVConstant *ConstPtrDiff =
+ dyn_cast<SCEVConstant>(PtrDiff)) {
+ int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
+ if (PD < (int64_t) TTI->getCacheLineSize()) {
+ DupPref = true;
+ break;
+ }
+ }
+ }
+ if (DupPref)
+ continue;
+
+ const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+ LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
+
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
+ SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
+
+ IRBuilder<> Builder(MemI);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
+ << "\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
+ << "prefetched memory access");
+
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
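The iterations-ahead computation above is simple integer arithmetic; a standalone sketch with made-up target numbers (the real prefetch distance, loop size, and cap come from TargetTransformInfo and CodeMetrics) might look like this:

#include <cstdint>
#include <iostream>

// Returns how many iterations ahead to prefetch, or 0 to give up entirely.
uint64_t itersAhead(uint64_t prefetchDistance, uint64_t loopSizeInsts,
                    uint64_t maxItersAhead) {
  if (loopSizeInsts == 0)
    loopSizeInsts = 1;                       // avoid dividing by zero
  uint64_t iters = prefetchDistance / loopSizeInsts;
  if (iters == 0)
    iters = 1;                               // always look at least one iteration ahead
  return iters > maxItersAhead ? 0 : iters;  // too far ahead: do not prefetch
}

int main() {
  // For example: ~300 instructions of latency to cover, a 25-instruction
  // loop body, and a cap of 16 iterations ahead.
  std::cout << itersAhead(300, 25, 16) << '\n';  // prints 12
}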
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 000000000000..ac4dd44a0e90
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,367 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+/// This function deletes dead loops. The caller of this function needs to
+/// guarantee that the loop is in fact dead. Here we handle two kinds of dead
+/// loops. The first kind (\p isLoopDead) is where only invariant values from
+/// within the loop are used outside of it. The second kind (\p
+/// isLoopNeverExecuted) is where the loop is provably never executed. We can
+/// always remove never executed loops since they will not cause any difference
+/// to program behaviour.
+///
+/// This also updates the relevant analysis information in \p DT, \p SE, and \p
+/// LI. It also updates the loop PM if an updater struct is provided.
+// TODO: This function will be used by loop-simplifyCFG as well. So, move this
+// to LoopUtils.cpp
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr);
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ BasicBlock::iterator BI = ExitBlock->begin();
+ bool AllEntriesInvariant = true;
+ bool AllOutgoingValuesSame = true;
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // Make sure all exiting blocks produce the same incoming value for the exit
+ // block. If there are different incoming values for different exiting
+ // blocks, then it is impossible to statically determine which value should
+ // be used.
+ AllOutgoingValuesSame =
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
+ return incoming == P->getIncomingValueForBlock(BB);
+ });
+
+ if (!AllOutgoingValuesSame)
+ break;
+
+ if (Instruction *I = dyn_cast<Instruction>(incoming))
+ if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
+ AllEntriesInvariant = false;
+ break;
+ }
+
+ ++BI;
+ }
+
+ if (Changed)
+ SE.forgetLoopDispositions(L);
+
+ if (!AllEntriesInvariant || !AllOutgoingValuesSame)
+ return false;
+
+ // Make sure that no instructions in the block have potential side-effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile.
+ for (auto &I : L->blocks())
+ if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
+ return true;
+}
+
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
+
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
+
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
+ return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite (unless it is never executed), as doing so could change
+/// the halting/non-halting nature of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ DEBUG(dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return false;
+ }
+ // We can't remove loops that contain subloops. If the subloops were dead,
+ // they would already have been removed in earlier executions of this pass.
+ if (L->begin() != L->end()) {
+ DEBUG(dbgs() << "Loop contains subloops.\n");
+ return false;
+ }
+
+
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ // Set incoming value to undef for phi nodes in the exit block.
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ for (unsigned i = 0; i < P->getNumIncomingValues(); i++)
+ P->setIncomingValue(i, UndefValue::get(P->getType()));
+ BI++;
+ }
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+ return true;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // We require that the loop only have a single exit block. Otherwise, we'd
+ // be in the situation of needing to be able to solve statically which exit
+ // block will be branched to, or trying to preserve the branching logic in
+ // a loop invariant manner.
+ if (!ExitBlock) {
+ DEBUG(dbgs() << "Deletion requires single exit block\n");
+ return false;
+ }
+ // Finally, we have to check that the loop really is dead.
+ bool Changed = false;
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ return Changed;
+ }
+
+ // Don't remove loops for which we can't solve the trip count.
+ // They could be infinite, in which case we'd be changing program behavior.
+ const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(S)) {
+ DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
+ return Changed;
+ }
+
+ DEBUG(dbgs() << "Loop is invariant, delete it!");
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+
+ return true;
+}
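A hypothetical C++ source exhibiting both kinds of deletable loop handled above (illustrative only; the pass operates on IR, typically after earlier passes have cleaned the code up):

int observable(int n) {
  int scratch = 0;

  // Kind 1: a dead loop. The trip count is computable, the body has no side
  // effects, and 'scratch' never influences anything observable afterwards.
  for (int i = 0; i < n; ++i)
    scratch += i;

  // Kind 2: a never-executed loop. Every path to its preheader is guarded by
  // a branch on a constant-false condition.
  if (false)
    for (int i = 0; i < n; ++i)
      scratch *= 2;

  (void)scratch;  // silence unused-value warnings in this sketch
  return n;       // neither loop contributes to the return value
}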
+
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+ auto *Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader should exist!");
+
+ // Now that we know the removal is safe, remove the loop by changing the
+ // branch from the preheader to go to the single exit block.
+ //
+ // Because we're deleting a large chunk of code at once, the sequence in which
+ // we remove things is very important to avoid invalidation issues.
+
+ // If we have an LPM updater, tell it about the loop being removed.
+ if (Updater)
+ Updater->markLoopAsDeleted(*L);
+
+ // Tell ScalarEvolution that the loop is deleted. Do this before
+ // deleting the loop so that ScalarEvolution can look at the loop
+ // to determine what it needs to clean up.
+ SE.forgetLoop(L);
+
+ auto *ExitBlock = L->getUniqueExitBlock();
+ assert(ExitBlock && "Should have a unique exit block!");
+
+ assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
+
+ // Connect the preheader directly to the exit block.
+ // Even when the loop is never executed, we cannot remove the edge from the
+ // source block to the exit block. Consider the case where the unexecuted loop
+ // branches back to an outer loop. If we deleted the loop and removed the edge
+ // coming to this inner loop, this will break the outer loop structure (by
+ // deleting the backedge of the outer loop). If the outer loop is indeed a
+ // non-loop, it will be deleted in a future iteration of loop deletion pass.
+ Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock);
+
+ // Rewrite phis in the exit block to get their inputs from the Preheader
+ // instead of the exiting block.
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ // Set the zero'th element of Phi to be from the preheader and remove all
+ // other incoming values. Given the loop has dedicated exits, all other
+ // incoming values must be from the exiting blocks.
+ int PredIndex = 0;
+ P->setIncomingBlock(PredIndex, Preheader);
+ // Removes all incoming values from all other exiting blocks (including
+ // duplicate values from an exiting block).
+ // Nuke all entries except the zero'th entry which is the preheader entry.
+ // NOTE! We need to remove Incoming Values in the reverse order as done
+ // below, to keep the indices valid for deletion (removeIncomingValues
+ // updates getNumIncomingValues and shifts all values down into the operand
+ // being deleted).
+ for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i)
+ P->removeIncomingValue(e-i, false);
+
+ assert((P->getNumIncomingValues() == 1 &&
+ P->getIncomingBlock(PredIndex) == Preheader) &&
+ "Should have exactly one value and that's from the preheader!");
+ ++BI;
+ }
+
+ // Update the dominator tree and remove the instructions and blocks that will
+ // be deleted from the reference counting scheme.
+ SmallVector<DomTreeNode*, 8> ChildNodes;
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI) {
+ // Move all of the block's children to be children of the Preheader, which
+ // allows us to remove the domtree entry for the block.
+ ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
+ for (DomTreeNode *ChildNode : ChildNodes) {
+ DT.changeImmediateDominator(ChildNode, DT[Preheader]);
+ }
+
+ ChildNodes.clear();
+ DT.eraseNode(*LI);
+
+ // Remove the block from the reference counting scheme, so that we can
+ // delete it freely later.
+ (*LI)->dropAllReferences();
+ }
+
+ // Erase the instructions and the blocks without having to worry
+ // about ordering because we already dropped the references.
+ // NOTE: This iteration is safe because erasing the block does not remove its
+ // entry from the loop's block list. We do that in the next section.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI)
+ (*LI)->eraseFromParent();
+
+ // Finally, the blocks from loopinfo. This has to happen late because
+ // otherwise our loop iterators won't work.
+
+ SmallPtrSet<BasicBlock *, 8> blocks;
+ blocks.insert(L->block_begin(), L->block_end());
+ for (BasicBlock *BB : blocks)
+ LI.removeBlock(BB);
+
+ // The last step is to update LoopInfo now that we've eliminated this loop.
+ LI.markAsRemoved(L);
+}
+
+PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L.dump());
+ if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopDeletionLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletionLegacyPass() : LoopPass(ID) {
+ initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopDeletionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
+
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L->dump());
+ return deleteLoopIfDead(L, DT, SE, LI);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
new file mode 100644
index 000000000000..3624bba10345
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -0,0 +1,987 @@
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass. Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles. It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis. Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Similarly to the Vectorizer, the pass also supports loop versioning to
+// run-time disambiguate potentially overlapping arrays.
+//
+//===----------------------------------------------------------------------===//
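As a hedged source-level sketch of what distribution does (hypothetical C++ with made-up names; the pass clones and rewires IR loops, and relies on run-time checks from LoopVersioning when the arrays below might alias):

void before(float *a, float *b, const float *c, int n) {
  for (int i = 1; i < n; ++i) {
    a[i] = a[i - 1] + c[i];  // loop-carried dependence: blocks vectorization
    b[i] = c[i] * 2.0f;      // independent work trapped in the same loop
  }
}

void after(float *a, float *b, const float *c, int n) {
  for (int i = 1; i < n; ++i)  // partition holding the dependence cycle
    a[i] = a[i - 1] + c[i];
  for (int i = 1; i < n; ++i)  // distributed partition, now vectorizable
    b[i] = c[i] * 2.0f;
}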
+
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <list>
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+using namespace llvm;
+
+static cl::opt<bool>
+ LDistVerify("loop-distribute-verify", cl::Hidden,
+ cl::desc("Turn on DominatorTree and LoopInfo verification "
+ "after Loop Distribution"),
+ cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+ "loop-distribute-non-if-convertible", cl::Hidden,
+ cl::desc("Whether to distribute into a loop that may not be "
+ "if-convertible by the loop vectorizer"),
+ cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Distribution"));
+
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"),
+ cl::init(false));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
+namespace {
+/// \brief Maintains the set of instructions of the loop for a partition before
+/// cloning. After cloning, it hosts the new loop.
+class InstPartition {
+ typedef SmallPtrSet<Instruction *, 8> InstructionSet;
+
+public:
+ InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+ : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
+ Set.insert(I);
+ }
+
+ /// \brief Returns whether this partition contains a dependence cycle.
+ bool hasDepCycle() const { return DepCycle; }
+
+ /// \brief Adds an instruction to this partition.
+ void add(Instruction *I) { Set.insert(I); }
+
+ /// \brief Collection accessors.
+ InstructionSet::iterator begin() { return Set.begin(); }
+ InstructionSet::iterator end() { return Set.end(); }
+ InstructionSet::const_iterator begin() const { return Set.begin(); }
+ InstructionSet::const_iterator end() const { return Set.end(); }
+ bool empty() const { return Set.empty(); }
+
+ /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// after this.
+ void moveTo(InstPartition &Other) {
+ Other.Set.insert(Set.begin(), Set.end());
+ Set.clear();
+ Other.DepCycle |= DepCycle;
+ }
+
+ /// \brief Populates the partition with a transitive closure of all the
+  /// instructions that the seeded instructions depend on.
+ void populateUsedSet() {
+ // FIXME: We currently don't use control-dependence but simply include all
+ // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+ // up.
+ for (auto *B : OrigLoop->getBlocks())
+ Set.insert(B->getTerminator());
+
+ // Follow the use-def chains to form a transitive closure of all the
+ // instructions that the originally seeded instructions depend on.
+ SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ // Insert instructions from the loop that we depend on.
+ for (Value *V : I->operand_values()) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+ Worklist.push_back(I);
+ }
+ }
+ }
+
+ /// \brief Clones the original loop.
+ ///
+ /// Updates LoopInfo and DominatorTree using the information that block \p
+ /// LoopDomBB dominates the loop.
+ Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+ unsigned Index, LoopInfo *LI,
+ DominatorTree *DT) {
+ ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+ VMap, Twine(".ldist") + Twine(Index),
+ LI, DT, ClonedLoopBlocks);
+ return ClonedLoop;
+ }
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ const Loop *getClonedLoop() const { return ClonedLoop; }
+
+ /// \brief Returns the loop where this partition ends up after distribution.
+ /// If this partition is mapped to the original loop then use the block from
+ /// the loop.
+ const Loop *getDistributedLoop() const {
+ return ClonedLoop ? ClonedLoop : OrigLoop;
+ }
+
+ /// \brief The VMap that is populated by cloning and then used in
+ /// remapinstruction to remap the cloned instructions.
+ ValueToValueMapTy &getVMap() { return VMap; }
+
+ /// \brief Remaps the cloned instructions using VMap.
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
+
+ /// \brief Based on the set of instructions selected for this partition,
+ /// removes the unnecessary ones.
+ void removeUnusedInsts() {
+ SmallVector<Instruction *, 8> Unused;
+
+ for (auto *Block : OrigLoop->getBlocks())
+ for (auto &Inst : *Block)
+ if (!Set.count(&Inst)) {
+ Instruction *NewInst = &Inst;
+ if (!VMap.empty())
+ NewInst = cast<Instruction>(VMap[NewInst]);
+
+ assert(!isa<BranchInst>(NewInst) &&
+ "Branches are marked used early on");
+ Unused.push_back(NewInst);
+ }
+
+ // Delete the instructions backwards, as it has a reduced likelihood of
+ // having to update as many def-use and use-def chains.
+ for (auto *Inst : reverse(Unused)) {
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->eraseFromParent();
+ }
+ }
+
+ void print() const {
+ if (DepCycle)
+ dbgs() << " (cycle)\n";
+ for (auto *I : Set)
+ // Prefix with the block name.
+ dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
+ }
+
+ void printBlocks() const {
+ for (auto *BB : getDistributedLoop()->getBlocks())
+ dbgs() << *BB;
+ }
+
+private:
+ /// \brief Instructions from OrigLoop selected for this partition.
+ InstructionSet Set;
+
+ /// \brief Whether this partition contains a dependence cycle.
+ bool DepCycle;
+
+ /// \brief The original loop.
+ Loop *OrigLoop;
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ Loop *ClonedLoop;
+
+ /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// partition is mapped to the original loop, this is empty.
+ SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+  /// \brief This gets populated once the set of instructions has been
+  /// finalized. If this partition is mapped to the original loop, it is not
+  /// set.
+ ValueToValueMapTy VMap;
+};
+
+/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+ typedef DenseMap<Instruction *, int> InstToPartitionIdT;
+
+public:
+ InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+ : L(L), LI(LI), DT(DT) {}
+
+ /// \brief Returns the number of partitions.
+ unsigned getSize() const { return PartitionContainer.size(); }
+
+ /// \brief Adds \p Inst into the current partition if that is marked to
+ /// contain cycles. Otherwise start a new partition for it.
+ void addToCyclicPartition(Instruction *Inst) {
+    // If the current partition is non-cyclic, start a new one.
+ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+ PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+ else
+ PartitionContainer.back().add(Inst);
+ }
+
+ /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// dependence cycles.
+ ///
+ // Initially we isolate memory instructions into as many partitions as
+ // possible, then later we may merge them back together.
+ void addToNewNonCyclicPartition(Instruction *Inst) {
+ PartitionContainer.emplace_back(Inst, L);
+ }
+
+ /// \brief Merges adjacent non-cyclic partitions.
+ ///
+ /// The idea is that we currently only want to isolate the non-vectorizable
+ /// partition. We could later allow more distribution among these partition
+ /// too.
+ void mergeAdjacentNonCyclic() {
+ mergeAdjacentPartitionsIf(
+ [](const InstPartition *P) { return !P->hasDepCycle(); });
+ }
+
+ /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// it. Try to merge it with a previous cyclic partition.
+ void mergeNonIfConvertible() {
+ mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+ if (Partition->hasDepCycle())
+ return true;
+
+ // Now, check if all stores are conditional in this partition.
+ bool seenStore = false;
+
+ for (auto *Inst : *Partition)
+ if (isa<StoreInst>(Inst)) {
+ seenStore = true;
+ if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+ return false;
+ }
+ return seenStore;
+ });
+ }
+
+ /// \brief Merges the partitions according to various heuristics.
+ void mergeBeforePopulating() {
+ mergeAdjacentNonCyclic();
+ if (!DistributeNonIfConvertible)
+ mergeNonIfConvertible();
+ }
+
+ /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ ///
+ /// We can't duplicate loads because that could potentially reorder them.
+ /// LoopAccessAnalysis provides dependency information with the context that
+ /// the order of memory operation is preserved.
+ ///
+  /// Returns true if any partitions were merged.
+ bool mergeToAvoidDuplicatedLoads() {
+ typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
+ typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
+
+ LoadToPartitionT LoadToPartition;
+ ToBeMergedT ToBeMerged;
+
+ // Step through the partitions and create equivalence between partitions
+ // that contain the same load. Also put partitions in between them in the
+ // same equivalence class to avoid reordering of memory operations.
+ for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+ E = PartitionContainer.end();
+ I != E; ++I) {
+ auto *PartI = &*I;
+
+ // If a load occurs in two partitions PartI and PartJ, merge all
+ // partitions (PartI, PartJ] into PartI.
+ for (Instruction *Inst : *PartI)
+ if (isa<LoadInst>(Inst)) {
+ bool NewElt;
+ LoadToPartitionT::iterator LoadToPart;
+
+ std::tie(LoadToPart, NewElt) =
+ LoadToPartition.insert(std::make_pair(Inst, PartI));
+ if (!NewElt) {
+ DEBUG(dbgs() << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", "
+ << LoadToPart->second << "\n" << *Inst << "\n");
+
+ auto PartJ = I;
+ do {
+ --PartJ;
+ ToBeMerged.unionSets(PartI, &*PartJ);
+ } while (&*PartJ != LoadToPart->second);
+ }
+ }
+ }
+ if (ToBeMerged.empty())
+ return false;
+
+ // Merge the members of an equivalence class into its class leader. This
+ // makes the members empty.
+ for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+
+ auto PartI = I->getData();
+ for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+ ToBeMerged.member_end())) {
+ PartJ->moveTo(*PartI);
+ }
+ }
+
+ // Remove the empty partitions.
+ PartitionContainer.remove_if(
+ [](const InstPartition &P) { return P.empty(); });
+
+ return true;
+ }
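+
+ // Illustrative example (hypothetical IR, for exposition only): with seeded
+ // partitions
+ //   P0: %x = load %a     P1: store %v, %q     P2: %y = load %a
+ // the load of %a would have to be duplicated into P0 and P2, which could
+ // reorder it relative to the store in P1. The equivalence classes above
+ // therefore place P0, P1 and P2 in one class and merge them into a single
+ // partition.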
+
+ /// \brief Sets up the mapping from instructions to partitions. If an
+ /// instruction is duplicated across multiple partitions, its entry is set
+ /// to -1.
+ void setupPartitionIdOnInstructions() {
+ int PartitionID = 0;
+ for (const auto &Partition : PartitionContainer) {
+ for (Instruction *Inst : Partition) {
+ bool NewElt;
+ InstToPartitionIdT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+ if (!NewElt)
+ Iter->second = -1;
+ }
+ ++PartitionID;
+ }
+ }
+
+ /// \brief Populates each partition with everything that its seeding
+ /// instructions require.
+ void populateUsedSet() {
+ for (auto &P : PartitionContainer)
+ P.populateUsedSet();
+ }
+
+ /// \brief This performs the main chunk of the work of cloning the loops for
+ /// the partitions.
+ void cloneLoops() {
+ BasicBlock *OrigPH = L->getLoopPreheader();
+ // At this point the predecessor of the preheader is either the memcheck
+ // block or the top part of the original preheader.
+ BasicBlock *Pred = OrigPH->getSinglePredecessor();
+ assert(Pred && "Preheader does not have a single predecessor");
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock && "No single exit block");
+ Loop *NewLoop;
+
+ assert(!PartitionContainer.empty() && "at least two partitions expected");
+ // We're cloning the preheader along with the loop so we already made sure
+ // it was empty.
+ assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+ "preheader not empty");
+
+ // Create a loop for each partition except the last. Clone the original
+ // loop before PH along with adding a preheader for the cloned loop. Then
+ // update PH to point to the newly added preheader.
+ BasicBlock *TopPH = OrigPH;
+ unsigned Index = getSize() - 1;
+ for (auto I = std::next(PartitionContainer.rbegin()),
+ E = PartitionContainer.rend();
+ I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+ auto *Part = &*I;
+
+ NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+ Part->getVMap()[ExitBlock] = TopPH;
+ Part->remapInstructions();
+ }
+ Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+ // Now go in forward order and update the immediate dominator for the
+ // preheaders with the exiting block of the previous loop. Dominance
+ // within the loop is updated in cloneLoopWithPreheader.
+ for (auto Curr = PartitionContainer.cbegin(),
+ Next = std::next(PartitionContainer.cbegin()),
+ E = PartitionContainer.cend();
+ Next != E; ++Curr, ++Next)
+ DT->changeImmediateDominator(
+ Next->getDistributedLoop()->getLoopPreheader(),
+ Curr->getDistributedLoop()->getExitingBlock());
+ }
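+
+ // Rough sketch of the resulting layout for two partitions (block names are
+ // illustrative): the last partition keeps the original loop, and each
+ // earlier partition gets a clone whose exit falls through into the next
+ // preheader:
+ //
+ //   Pred -> PH.clone0 -> Loop.clone0 -> OrigPH -> OrigLoop -> ExitBlock
+ //
+ // This is why the clone's VMap sends ExitBlock to TopPH above, and why the
+ // forward walk afterwards re-parents each preheader under the previous
+ // loop's exiting block in the dominator tree.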
+
+ /// \brief Removes the dead instructions from the cloned loops.
+ void removeUnusedInsts() {
+ for (auto &Partition : PartitionContainer)
+ Partition.removeUnusedInsts();
+ }
+
+ /// \brief For each memory pointer, computes the partition Id the pointer is
+ /// used in.
+ ///
+ /// This returns an array of int where the I-th entry corresponds to the I-th
+ /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in
+ /// multiple partitions, its entry is set to -1.
+ SmallVector<int, 8>
+ computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+ unsigned N = RtPtrCheck->Pointers.size();
+ SmallVector<int, 8> PtrToPartitions(N);
+ for (unsigned I = 0; I < N; ++I) {
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+ auto Instructions =
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+ int &Partition = PtrToPartitions[I];
+ // First set it to uninitialized.
+ Partition = -2;
+ for (Instruction *Inst : Instructions) {
+ // Note that this could be -1 if Inst is duplicated across multiple
+ // partitions.
+ int ThisPartition = this->InstToPartitionId[Inst];
+ if (Partition == -2)
+ Partition = ThisPartition;
+ // -1 means belonging to multiple partitions.
+ else if (Partition == -1)
+ break;
+ else if (Partition != (int)ThisPartition)
+ Partition = -1;
+ }
+ assert(Partition != -2 && "Pointer not belonging to any partition");
+ }
+
+ return PtrToPartitions;
+ }
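+
+ // Small illustrative example (values are hypothetical): with three
+ // partitions, a pointer whose accesses all sit in partition 1 gets entry 1,
+ // while a pointer accessed from both partition 0 and partition 2 gets entry
+ // -1, so any runtime check involving it cannot be dropped as redundant later.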
+
+ void print(raw_ostream &OS) const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ OS << "Partition " << Index++ << " (" << &P << "):\n";
+ P.print();
+ }
+ }
+
+ void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const InstPartitionContainer &Partitions) {
+ Partitions.print(OS);
+ return OS;
+ }
+#endif
+
+ void printBlocks() const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+ P.printBlocks();
+ }
+ }
+
+private:
+ typedef std::list<InstPartition> PartitionContainerT;
+
+ /// \brief List of partitions.
+ PartitionContainerT PartitionContainer;
+
+ /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// belongs to multiple partitions, the entry contains -1.
+ InstToPartitionIdT InstToPartitionId;
+
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+
+ /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// the \p Predicate.
+ template <class UnaryPredicate>
+ void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+ InstPartition *PrevMatch = nullptr;
+ for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+ auto DoesMatch = Predicate(&*I);
+ if (PrevMatch == nullptr && DoesMatch) {
+ PrevMatch = &*I;
+ ++I;
+ } else if (PrevMatch != nullptr && DoesMatch) {
+ I->moveTo(*PrevMatch);
+ I = PartitionContainer.erase(I);
+ } else {
+ PrevMatch = nullptr;
+ ++I;
+ }
+ }
+ }
+};
+
+/// \brief For each memory instruction, this class maintains the difference
+/// between the number of unsafe dependences that start at this instruction
+/// and the number that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+ typedef MemoryDepChecker::Dependence Dependence;
+
+public:
+ struct Entry {
+ Instruction *Inst;
+ unsigned NumUnsafeDependencesStartOrEnd;
+
+ Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
+ };
+
+ typedef SmallVector<Entry, 8> AccessesType;
+
+ AccessesType::const_iterator begin() const { return Accesses.begin(); }
+ AccessesType::const_iterator end() const { return Accesses.end(); }
+
+ MemoryInstructionDependences(
+ const SmallVectorImpl<Instruction *> &Instructions,
+ const SmallVectorImpl<Dependence> &Dependences) {
+ Accesses.append(Instructions.begin(), Instructions.end());
+
+ DEBUG(dbgs() << "Backward dependences:\n");
+ for (auto &Dep : Dependences)
+ if (Dep.isPossiblyBackward()) {
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+ --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+ DEBUG(Dep.print(dbgs(), 2, Instructions));
+ }
+ }
+
+private:
+ AccessesType Accesses;
+};
+
+/// \brief The actual class performing the per-loop work.
+class LoopDistributeForLoop {
+public:
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
+ }
+
+ /// \brief Try to distribute an inner-most loop.
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ assert(L->empty() && "Only process inner loops.");
+
+ DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
+ if (!L->getExitBlock())
+ return fail("MultipleExitBlocks", "multiple exit blocks");
+ if (!L->isLoopSimplifyForm())
+ return fail("NotLoopSimplifyForm",
+ "loop is not in loop-simplify form");
+
+ BasicBlock *PH = L->getLoopPreheader();
+
+ // LAA will check that we only have a single exiting block.
+ LAI = &GetLAA(*L);
+
+ // Currently, we only distribute to isolate the part of the loop with
+ // dependence cycles to enable partial vectorization.
+ if (LAI->canVectorizeMemory())
+ return fail("MemOpsCanBeVectorized",
+ "memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+
+ InstPartitionContainer Partitions(L, LI, DT);
+
+ // First, go through each memory operation and assign them to consecutive
+ // partitions (the order of partitions follows program order). Put those
+ // with unsafe dependences into a "cyclic" partition; otherwise put each
+ // store in its own "non-cyclic" partition (we'll merge these later).
+ //
+ // Note that a memory operation (e.g. Load2 below) at a program point that
+ // has an unsafe dependence (Store3->Load1) spanning over it must be
+ // included in the same cyclic partition as the dependent operations. This
+ // is to preserve the original program order after distribution. E.g.:
+ //
+ // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
+ // Load1 -. 1 0->1
+ // Load2 | /Unsafe/ 0 1
+ // Store3 -' -1 1->0
+ // Load4 0 0
+ //
+ // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+ // we just keep assigning to the same cyclic partition until
+ // NumUnsafeDependencesActive reaches 0.
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+ *Dependences);
+
+ int NumUnsafeDependencesActive = 0;
+ for (auto &InstDep : MID) {
+ Instruction *I = InstDep.Inst;
+ // We update NumUnsafeDependencesActive post-instruction, so catch the
+ // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+ if (NumUnsafeDependencesActive ||
+ InstDep.NumUnsafeDependencesStartOrEnd > 0)
+ Partitions.addToCyclicPartition(I);
+ else
+ Partitions.addToNewNonCyclicPartition(I);
+ NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+ assert(NumUnsafeDependencesActive >= 0 &&
+ "Negative number of dependences active");
+ }
+
+ // Add partitions for values used outside. These partitions can be out of
+ // order from the original program order. This is OK because if the
+ // partition uses a load we will merge this partition with the original
+ // partition of the load that we set up in the previous loop (see
+ // mergeToAvoidDuplicatedLoads).
+ auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+ for (auto *Inst : DefsUsedOutside)
+ Partitions.addToNewNonCyclicPartition(Inst);
+
+ DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+ // should be able to vectorize these together.
+ Partitions.mergeBeforePopulating();
+ DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Now, populate the partitions with non-memory operations.
+ Partitions.populateUsedSet();
+ DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+ // In order to preserve original lexical order for loads, keep them in the
+ // partition that we set up in the MemoryInstructionDependences loop.
+ if (Partitions.mergeToAvoidDuplicatedLoads()) {
+ DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+ }
+
+ // Don't distribute the loop if we need too many SCEV run-time checks.
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("TooManySCEVRuntimeChecks",
+ "too many SCEV run-time checks needed.\n");
+
+ DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ // We're done forming the partitions; set up the reverse mapping from
+ // instructions to partitions.
+ Partitions.setupPartitionIdOnInstructions();
+
+ // To keep things simple, have an empty preheader before we version or
+ // clone the loop. (Also split if this has no predecessor, i.e. entry,
+ // because we rely on PH having a predecessor.)
+ if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+ SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+ // If we need run-time checks, version the loop now.
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
+ const auto &AllChecks = RtPtrChecking->getChecks();
+ auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+ RtPtrChecking);
+
+ if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+ DEBUG(dbgs() << "\nPointers:\n");
+ DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
+ LVer.setAliasChecks(std::move(Checks));
+ LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
+ LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
+ }
+
+ // Create identical copies of the original loop for each partition and hook
+ // them up sequentially.
+ Partitions.cloneLoops();
+
+ // Now, remove the instructions from each loop that don't belong to that
+ // partition.
+ Partitions.removeUnusedInsts();
+ DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ DEBUG(Partitions.printBlocks());
+
+ if (LDistVerify) {
+ LI->verify(*DT);
+ DT->verifyDomTree();
+ }
+
+ ++NumLoopsDistributed;
+ // Report the success.
+ ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop");
+ return true;
+ }
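+
+ // A minimal before/after sketch of what a successful run of processLoop
+ // achieves (C-like pseudo code, array names purely illustrative):
+ //
+ //   for (i) {                      for (i)
+ //     A[i + 1] = A[i];               A[i + 1] = A[i];     // cyclic part
+ //     D[i] = B[i] + C[i];    ==>   for (i)
+ //   }                               D[i] = B[i] + C[i];   // vectorizable part
+ //
+ // When runtime pointer or SCEV checks are required, LoopVersioning above
+ // guards the distributed loops with them.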
+
+ /// \brief Provide diagnostics, then \return false.
+ bool fail(StringRef RemarkName, StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With -Rpass-missed, report that distribution failed.
+ ORE->emit(
+ OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(),
+ L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for more "
+ "info");
+
+ // With -Rpass-analysis, report why. This is on by default if distribution
+ // was requested explicitly.
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// \brief Return whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// \brief Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4>
+ includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
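+
+ // Hypothetical example: with PtrToPartition = {0, 0, 1}, a check between a
+ // group containing pointer 0 and a group containing pointer 2 is kept
+ // (partitions 0 and 1 differ), while a check whose only conflicting pairs
+ // are pointers 0 and 1 is dropped because both live in partition 0.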
+
+ /// \brief Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ // Build up a worklist of inner-loops to distribute. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+/// \brief The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ LoopDistributeLegacy() : FunctionPass(ID) {
+ // The default is set by the caller.
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ static char ID;
+};
+} // anonymous namespace
+
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We don't directly need these analyses but they're required for loop
+ // analyses so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
new file mode 100644
index 000000000000..4a6a35c0ab1b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -0,0 +1,1685 @@
+//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form. In cases where this kicks in, it can be a significant
+// performance win.
+//
+// If compiling for code size we avoid idiom recognition if the resulting
+// code could be larger than the code for the original loop. One way this could
+// happen is if the loop is not removable after idiom recognition due to the
+// presence of non-idiom instructions. The initial implementation of the
+// heuristics applies to idioms in multi-block loops.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+// memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+// fpowi
+// Future integer operation idioms to recognize:
+// ctpop, ctlz, cttz
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set. It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
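+
+// As a rough illustration (char arrays a and b, bounds assumed well-formed),
+// the two idioms counted by the statistics below are loops of the shape
+//
+//   for (i = 0; i < n; ++i)        for (i = 0; i < n; ++i)
+//     a[i] = 0;                      a[i] = b[i];
+//
+// which are emitted as a single memset(a, 0, n) and memcpy(a, b, n),
+// respectively, in the loop preheader, provided the loop is countable and the
+// accesses are proven not to interfere.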
+
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+
+static cl::opt<bool> UseLIRCodeSizeHeurs(
+ "use-lir-code-size-heurs",
+ cl::desc("Use loop idiom recognition code size heuristics when compiling "
+ "with -Os/-Oz"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class LoopIdiomRecognize {
+ Loop *CurLoop;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL;
+ bool ApplyCodeSizeHeuristics;
+
+public:
+ explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+ : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI),
+ DL(DL) {}
+
+ bool runOnLoop(Loop *L);
+
+private:
+ typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, StoreList> StoreListMap;
+ StoreListMap StoreRefsForMemset;
+ StoreListMap StoreRefsForMemsetPattern;
+ StoreList StoreRefsForMemcpy;
+ bool HasMemset;
+ bool HasMemsetPattern;
+ bool HasMemcpy;
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ UnorderedAtomicMemcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
+
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ void collectStores(BasicBlock *BB);
+ LegalStoreKind isLegalStore(StoreInst *SI);
+ bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
+ bool ForMemset);
+ bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+ bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ unsigned StoreAlignment, Value *StoredVal,
+ Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ const SCEVAddRecExpr *Ev, const SCEV *BECount,
+ bool NegStride, bool IsLoopMemset = false);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+ bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
+ bool IsLoopMemset = false);
+
+ /// @}
+ /// \name Noncountable Loop Idiom Handling
+ /// @{
+
+ bool runOnNoncountableLoop();
+
+ bool recognizePopcount();
+ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
+
+ /// @}
+};
+
+class LoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+ explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+ return LIR.runOnLoop(L);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+} // End anonymous namespace.
+
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
+ if (!LIR.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
+
+static void deleteDeadInstruction(Instruction *I) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L) {
+ CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ // Disable loop idiom recognition if the function's name is a common idiom.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy")
+ return false;
+
+ // Determine if code size heuristics need to be applied.
+ ApplyCodeSizeHeuristics =
+ L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
+
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+
+ if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
+
+ return runOnNoncountableLoop();
+}
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+ const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+ assert(!isa<SCEVCouldNotCompute>(BECount) &&
+ "runOnCountableLoop() called on a loop without a predictable "
+ "backedge-taken count");
+
+ // If this loop executes exactly one time, then it should be peeled, not
+ // optimized by this pass.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ if (BECst->getAPInt() == 0)
+ return false;
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+ DEBUG(dbgs() << "loop-idiom Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
+ << CurLoop->getHeader()->getName() << "\n");
+
+ bool MadeChange = false;
+
+ // The following transforms hoist stores/memsets into the loop pre-header.
+ // Give up if the loop has instructions that may throw.
+ LoopSafetyInfo SafetyInfo;
+ computeLoopSafetyInfo(&SafetyInfo, CurLoop);
+ if (SafetyInfo.MayThrow)
+ return MadeChange;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : CurLoop->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LI->getLoopFor(BB) != CurLoop)
+ continue;
+
+ MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+ }
+ return MadeChange;
+}
+
+static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
+ uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+ assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
+ "Don't overflow unsigned.");
+ return (unsigned)SizeInBits >> 3;
+}
+
+static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
+ const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
+ return ConstStride->getAPInt();
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in. Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+ // If the value isn't a constant, we can't promote it to being in a constant
+ // array. We could theoretically do a store to an alloca or something, but
+ // that doesn't seem worthwhile.
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ // Only handle simple values that are a power of two bytes in size.
+ uint64_t Size = DL->getTypeSizeInBits(V->getType());
+ if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+ return nullptr;
+
+ // Don't care enough about darwin/ppc to implement this.
+ if (DL->isBigEndian())
+ return nullptr;
+
+ // Convert to size in bytes.
+ Size /= 8;
+
+ // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
+ // if the top and bottom are the same (e.g. for vectors and large integers).
+ if (Size > 16)
+ return nullptr;
+
+ // If the constant is exactly 16 bytes, just use it.
+ if (Size == 16)
+ return C;
+
+ // Otherwise, we'll use an array of the constants.
+ unsigned ArraySize = 16 / Size;
+ ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+ return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
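+
+// For instance (illustrative only): a strided store of the i32 constant
+// 0x01020304 is not byte-wise splattable, but this routine returns a
+// [4 x i32] constant array holding four copies of 0x01020304, i.e. the
+// 16-byte pattern that memset_pattern16 expects.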
+
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+
+ // Don't touch volatile stores.
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+ // Don't convert stores of non-integral pointer types to memsets (which store
+ // integers).
+ if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
+ return LegalStoreKind::None;
+
+ // Avoid merging nontemporal stores.
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return LegalStoreKind::None;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
+ // Reject stores that are so large that they overflow an unsigned.
+ uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *StoreEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return LegalStoreKind::None;
+
+ // Check to see if we have a constant stride.
+ if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+ return LegalStoreKind::None;
+
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ // Note: memset and memset_pattern on unordered-atomic is not yet supported
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
+ if (!UnorderedAtomic && HasMemset && SplatValue &&
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ return LegalStoreKind::Memset;
+ } else if (!UnorderedAtomic && HasMemsetPattern &&
+ // Don't create memset_pattern16s with non-default address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ return LegalStoreKind::MemsetPattern;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
+ if (HasMemcpy) {
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ if (StoreSize != Stride && StoreSize != -Stride)
+ return LegalStoreKind::None;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return LegalStoreKind::None;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return LegalStoreKind::None;
+
+ // Success. This store can be converted into a memcpy.
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return LegalStoreKind::None;
+}
+
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+ StoreRefsForMemset.clear();
+ StoreRefsForMemsetPattern.clear();
+ StoreRefsForMemcpy.clear();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+
+ // Make sure this is a strided store with a constant stride.
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemset[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::MemsetPattern: {
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
+ StoreRefsForMemsetPattern[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
+ StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
+ }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+ BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, ExitBlocks[i]))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ collectStores(BB);
+
+ // Look for a single store or sets of stores with a common base, which can be
+ // optimized into a memset (memset_pattern). The latter most commonly happens
+ // with structs and hand-unrolled loops.
+ for (auto &SL : StoreRefsForMemset)
+ MadeChange |= processLoopStores(SL.second, BECount, true);
+
+ for (auto &SL : StoreRefsForMemsetPattern)
+ MadeChange |= processLoopStores(SL.second, BECount, false);
+
+ // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// processLoopStores - See if these stores can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
+ const SCEV *BECount,
+ bool ForMemset) {
+ // Try to find consecutive stores that can be transformed into memsets.
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = SL.size(); i < e; ++i) {
+ assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
+
+ Value *FirstStoredVal = SL[i]->getValueOperand();
+ Value *FirstStorePtr = SL[i]->getPointerOperand();
+ const SCEVAddRecExpr *FirstStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
+ APInt FirstStride = getStoreStride(FirstStoreEv);
+ unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);
+
+ // See if we can optimize just this store in isolation.
+ if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
+ Heads.insert(SL[i]);
+ continue;
+ }
+
+ Value *FirstSplatValue = nullptr;
+ Constant *FirstPatternValue = nullptr;
+
+ if (ForMemset)
+ FirstSplatValue = isBytewiseValue(FirstStoredVal);
+ else
+ FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
+
+ assert((FirstSplatValue || FirstPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search the Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance of finding a memset
+ // opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
+ Value *SecondStorePtr = SL[k]->getPointerOperand();
+ const SCEVAddRecExpr *SecondStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
+ APInt SecondStride = getStoreStride(SecondStoreEv);
+
+ if (FirstStride != SecondStride)
+ continue;
+
+ Value *SecondStoredVal = SL[k]->getValueOperand();
+ Value *SecondSplatValue = nullptr;
+ Constant *SecondPatternValue = nullptr;
+
+ if (ForMemset)
+ SecondSplatValue = isBytewiseValue(SecondStoredVal);
+ else
+ SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
+
+ assert((SecondSplatValue || SecondPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+ if (ForMemset) {
+ if (FirstSplatValue != SecondSplatValue)
+ continue;
+ } else {
+ if (FirstPatternValue != SecondPatternValue)
+ continue;
+ }
+ Tails.insert(SL[k]);
+ Heads.insert(SL[i]);
+ ConsecutiveChain[SL[i]] = SL[k];
+ break;
+ }
+ }
+ }
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we transformed so that we don't visit the same store twice.
+ SmallPtrSet<Value *, 16> TransformedStores;
+ bool Changed = false;
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to transform it.
+ SmallPtrSet<Instruction *, 8> AdjacentStores;
+ StoreInst *I = *it;
+
+ StoreInst *HeadStore = I;
+ unsigned StoreSize = 0;
+
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (TransformedStores.count(I))
+ break;
+ AdjacentStores.insert(I);
+
+ StoreSize += getStoreSizeInBytes(I, DL);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ Value *StoredVal = HeadStore->getValueOperand();
+ Value *StorePtr = HeadStore->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+
+ // Check to see if the stride matches the size of the stores. If so, then
+ // we know that every byte is touched in the loop.
+ if (StoreSize != Stride && StoreSize != -Stride)
+ continue;
+
+ bool NegStride = StoreSize == -Stride;
+
+ if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
+ StoredVal, HeadStore, AdjacentStores, StoreEv,
+ BECount, NegStride)) {
+ TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
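+
+// A hedged example of the chaining above (types and strides hypothetical):
+// two i16 stores per iteration, to p[2*i] and p[2*i+1] of the same splat
+// value, each see a 4-byte stride, so neither qualifies on its own (2 != 4);
+// linked as a head/tail pair their combined StoreSize of 4 matches the stride
+// and the pair is handed to processLoopStridedStore together.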
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memsets with a constant size.
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ return false;
+
+ // If we're not allowed to hack on memset, we fail.
+ if (!HasMemset)
+ return false;
+
+ Value *Pointer = MSI->getDest();
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+ if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ return false;
+
+ // Reject memsets that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check to see if the stride matches the size of the memset. If so, then we
+ // know that every byte is touched in the loop.
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
+
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
+ return false;
+
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
+ SmallPtrSet<Instruction *, 1> MSIs;
+ MSIs.insert(MSI);
+ bool NegStride = SizeInBytes == -Stride;
+ return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
+ MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
+ BECount, NegStride, /*IsLoopMemset=*/true);
+}
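+
+// Sketch with assumed values: a loop whose body is memset(&A[i], 0, 4) over
+// an i32 array A has SizeInBytes == 4 == Stride, so every byte of the range
+// is written and the per-iteration memset is forwarded to
+// processLoopStridedStore to become one memset of (BECount + 1) * 4 bytes in
+// the preheader.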
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+ // Get the location that may be stored across the loop. Since the access is
+ // strided positively through memory, we say that the modified location starts
+ // at the pointer and has infinite size.
+ uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+ // If the loop iterates a fixed number of times, we can refine the access size
+ // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
+
+ // TODO: For this to be really effective, we have to dive into the pointer
+ // operand in the store. A store to &A[i] over 100 iterations will always
+ // return MayAlias with a store to &A[100]; we need StoreLoc to be "A" with
+ // size 100, which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+ ++BI)
+ for (Instruction &I : **BI)
+ if (IgnoredStores.count(&I) == 0 &&
+ (AA.getModRefInfo(&I, StoreLoc) & Access))
+ return true;
+
+ return false;
+}
+
+// If we have a negative stride, Start refers to the end of the memory location
+// we're trying to memset. Therefore, we need to recompute the base pointer,
+// which is just Start - BECount*Size.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+ Type *IntPtr, unsigned StoreSize,
+ ScalarEvolution *SE) {
+ const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+ if (StoreSize != 1)
+ Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ return SE->getMinusSCEV(Start, Index);
+}
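+
+// Worked example under assumed values: for a store evolving as
+// {&A[n-1],+,-4} over an i32 array with BECount == n - 1 and StoreSize == 4,
+// Index is (n - 1) * 4 and the returned start is &A[n-1] - (n-1)*4 == &A[0],
+// the lowest address the loop actually writes.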
+
+/// processLoopStridedStore - We see a strided store of some value. If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+ Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
+ Value *StoredVal, Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ if (!SplatValue)
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ // The trip count of the loop and the base pointer of the addrec SCEV are
+ // guaranteed to be loop invariant, which means that they should dominate the
+ // header. This allows us to insert code for them in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
+
+ const SCEV *Start = Ev->getStart();
+ // Handle negative strided loops.
+ if (NegStride)
+ Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
+ return false;
+
+ // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+ // this into a memset in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write to the aliased location. Check for any overlap by generating the
+ // base pointer and checking the region.
+ Value *BasePtr =
+ Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+ if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
+ *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
+ return false;
+ }
+
+ if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
+ return false;
+
+ // Okay, everything looks good, insert the memset.
+
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+ const SCEV *NumBytesS =
+ SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
+ return false;
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+ CallInst *NewCall;
+ if (SplatValue) {
+ NewCall =
+ Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
+ } else {
+ // Everything is emitted in the default address space
+ Type *Int8PtrTy = DestInt8PtrTy;
+
+ Module *M = TheStore->getModule();
+ Value *MSP =
+ M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+ Int8PtrTy, Int8PtrTy, IntPtr);
+ inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
+
+ // Otherwise we should form a memset_pattern16. PatternValue is known to be
+ // a constant array of 16 bytes. Plop the value into a mergeable global.
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+ GV->setAlignment(16);
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+ }
+
+ DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ // Okay, the memset has been formed. Zap the original store and anything that
+ // feeds into it.
+ for (auto *I : Stores)
+ deleteDeadInstruction(I);
+ ++NumMemSet;
+ return true;
+}
+
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy. This kicks in for stuff like
+/// for (i) A[i] = B[i];
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+ const SCEV *BECount) {
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ bool NegStride = StoreSize == -Stride;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+ // The trip count of the loop and the base pointer of the addrec SCEV are
+ // guaranteed to be loop invariant, which means they should dominate the
+ // header. This allows us to insert code for them in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+ const SCEV *StrStart = StoreEv->getStart();
+ unsigned StrAS = SI->getPointerAddressSpace();
+ Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
+
+ // Handle negative strided loops.
+ if (NegStride)
+ StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write the memory region we're storing to. This includes the load that
+ // feeds the stores. Check for an alias by generating the base address and
+ // checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(
+ StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
+ SmallPtrSet<Instruction *, 1> Stores;
+ Stores.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ return false;
+ }
+
+ const SCEV *LdStart = LoadEv->getStart();
+ unsigned LdAS = LI->getPointerAddressSpace();
+
+ // Handle negative strided loops.
+ if (NegStride)
+ LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ Value *LoadBasePtr = Expander.expandCodeFor(
+ LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+ if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
+ *AA, Stores)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ return false;
+ }
+
+ if (avoidLIRForMultiBlockLoop())
+ return false;
+
+ // Okay, everything is safe, we can transform this!
+
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+
+ const SCEV *NumBytesS =
+ SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+
+ if (StoreSize != 1)
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+ SCEV::FlagNUW);
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+ // If the load or store is atomic, then it must necessarily be unordered
+ // due to the previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ if (Align < StoreSize)
+ return false;
+
+ // If the element.atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size
+ // specific lib call. If the lib call doesn't exist for our store size, then
+ // we shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+ return false;
+
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
+
+ // Propagate alignment info onto the pointer args. Note that unordered
+ // atomic loads/stores are *required* by the spec to have an alignment
+ // but non-atomic loads/stores may not.
+ NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(),
+ SI->getAlignment()));
+ NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(),
+ LI->getAlignment()));
+ }
+ NewCall->setDebugLoc(SI->getDebugLoc());
+
+ DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+ // Okay, the memcpy has been formed. Zap the original store and anything that
+ // feeds into it.
+ deleteDeadInstruction(SI);
+ ++NumMemCpy;
+ return true;
+}
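
A standalone sketch of the `for (i) A[i] = B[i];` shape named in the doc comment above (hypothetical names); the pass proves the two regions do not interfere before doing the equivalent of:

#include <cstddef>
#include <cstring>

// A store of a load with the same stride in the same loop.
void copy_loop(int *dst, const int *src, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    dst[i] = src[i];
}

// Conceptual result of the rewrite, valid only when the alias checks above
// show that the loop neither writes the source region nor otherwise reads
// or writes the destination region.
void copy_memcpy(int *dst, const int *src, std::size_t n) {
  if (n != 0)
    std::memcpy(dst, src, n * sizeof(int));
}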
+
+// When compiling for codesize we avoid idiom recognition for a multi-block loop
+// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
+//
+bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
+ bool IsLoopMemset) {
+ if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
+ if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
+ DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+ return recognizePopcount() || recognizeAndInsertCTLZ();
+}
+
+/// Check if the given conditional branch is based on the comparison between
+/// a variable and zero, and if the variable is non-zero, the control yields to
+/// the loop entry. If the branch matches the behavior, the variable involved
+/// in the comparison is returned. This function will be called to see if the
+/// precondition and postcondition of the loop are in desirable form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
+ if (!BI || !BI->isConditional())
+ return nullptr;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+
+ ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+ if (!CmpZero || !CmpZero->isZero())
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
+ (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
+ return Cond->getOperand(0);
+
+ return nullptr;
+}
+
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+ BasicBlock *LoopEntry) {
+ auto *PhiX = dyn_cast<PHINode>(VarX);
+ if (PhiX && PhiX->getParent() == LoopEntry &&
+ (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+ return PhiX;
+ return nullptr;
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the population bit.
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose population bits are being counted.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 != 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x1 = phi (x0, x2);
+/// cnt1 = phi(cnt0, cnt2);
+///
+/// cnt2 = cnt1 + 1;
+/// ...
+/// x2 = x1 & (x1 - 1);
+/// ...
+/// } while(x != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Value *&Var) {
+ // step 1: Check to see if the loop-back branch matches this pattern:
+ // "if (a!=0) goto loop-entry".
+ BasicBlock *LoopEntry;
+ Instruction *DefX2, *CountInst;
+ Value *VarX1, *VarX0;
+ PHINode *PhiX, *CountPhi;
+
+ DefX2 = CountInst = nullptr;
+ VarX1 = VarX0 = nullptr;
+ PhiX = CountPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ {
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX2 = dyn_cast<Instruction>(T);
+ else
+ return false;
+ }
+
+ // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+ {
+ if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+ return false;
+
+ BinaryOperator *SubOneOp;
+
+ if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+ VarX1 = DefX2->getOperand(1);
+ else {
+ VarX1 = DefX2->getOperand(0);
+ SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+ }
+ if (!SubOneOp)
+ return false;
+
+ Instruction *SubInst = cast<Instruction>(SubOneOp);
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ if (!Dec ||
+ !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubInst->getOpcode() == Instruction::Add &&
+ Dec->isMinusOne()))) {
+ return false;
+ }
+ }
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+ {
+ CountInst = nullptr;
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ // Check if the result of the instruction is live out of the loop.
+ bool LiveOutLoop = false;
+ for (User *U : Inst->users()) {
+ if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+ LiveOutLoop = true;
+ break;
+ }
+ }
+
+ if (LiveOutLoop) {
+ CountInst = Inst;
+ CountPhi = Phi;
+ break;
+ }
+ }
+
+ if (!CountInst)
+ return false;
+ }
+
+ // step 5: check if the precondition is in this form:
+ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+ {
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+ if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+ return false;
+
+ CntInst = CountInst;
+ CntPhi = CountPhi;
+ Var = T;
+ }
+
+ return true;
+}
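
Written out as a compilable function (hypothetical names), the idiom from the doc comment above and the closed form it is eventually rewritten to, with __builtin_popcount used here purely as a stand-in for the ctpop intrinsic:

// The clear-lowest-set-bit loop that detectPopcountIdiom matches.
unsigned popcount_loop(unsigned x) {
  unsigned cnt = 0;            // cnt0 = init-val
  if (x != 0) {                // the precondition branch
    do {
      cnt = cnt + 1;           // cnt2 = cnt1 + 1
      x = x & (x - 1);         // x2 = x1 & (x1 - 1)
    } while (x != 0);
  }
  return cnt;
}

// Closed form the transformation works toward (stand-in builtin).
unsigned popcount_closed(unsigned x) {
  return static_cast<unsigned>(__builtin_popcount(x));
}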
+
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting leading zeros (CTLZ),
+///    or nullptr if there is no such instruction.
+/// 2) \p CntPhi is set to the corresponding phi node,
+///    or nullptr if there is no such node.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+ if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ // step 4: Find the instruction which does the counting: cnt.next = cnt + 1
+ // TODO: We could skip this step. If the loop trip count is known (via CTLZ),
+ // then all uses of "cnt.next" could be optimized to the trip count
+ // plus "cnt0". Currently this is not done.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
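
A compilable sketch of the shift-until-zero idiom described above (hypothetical names; a logical shift is used for readability, while the matcher above looks for an arithmetic shift right), together with the CTLZ-based closed form, using __builtin_clz only as a stand-in for the ctlz intrinsic:

#include <limits>

// The shift loop detectCTLZIdiom matches: cnt ends up holding the number of
// significant bits in x, i.e. BitWidth(x) - CTLZ(x).
unsigned significant_bits_loop(unsigned x) {
  unsigned cnt = 0;
  if (x != 0) {                // guard; mirrors the "x0 == 0" precondition
    do {
      cnt = cnt + 1;           // cnt.next = cnt + 1
      x = x >> 1;              // x.next = x >> 1
    } while (x != 0);
  }
  return cnt;
}

// CTLZ-based closed form; __builtin_clz(0) is undefined, hence the guard
// (this is what the ZeroCheck flag below is about).
unsigned significant_bits_closed(unsigned x) {
  if (x == 0)
    return 0;
  return std::numeric_limits<unsigned>::digits -
         static_cast<unsigned>(__builtin_clz(x));
}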
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ trip count).
+/// Returns true if CTLZ was inserted as the new trip count; otherwise, false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+ // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+ // when X is 0. If we cannot guarantee X != 0, we need to check this
+ // when expanding the intrinsic.
+ bool ZeroCheck = false;
+ // It is safe to assume the preheader exists, as that was checked in the
+ // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+ // If we check X != 0 before entering the loop, we don't need a zero check
+ // in the CTLZ intrinsic, but only if the count phi is not used outside of
+ // the loop (if it is used, we count CTLZ(X >> 1)).
+ if (!IsCntPhiUsedOutsideLoop)
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+ // Check if CTLZ intrinsic is profitable. Assume it is always profitable
+ // if we delete the loop (the loop has only 6 instructions):
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+ if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+ // Counting the population is usually done with a few arithmetic
+ // instructions. Such instructions can easily be "absorbed" by vacant slots
+ // in a non-compact loop. Therefore, recognizing the popcount idiom only
+ // makes sense in a compact loop.
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ BasicBlock *LoopBody = *(CurLoop->block_begin());
+ if (LoopBody->size() >= 20) {
+ // The loop is too big, bail out.
+ return false;
+ }
+
+ // It should have a preheader containing nothing but an unconditional branch.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ if (!PH || &PH->front() != PH->getTerminator())
+ return false;
+ auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+ if (!EntryBI || EntryBI->isConditional())
+ return false;
+
+ // It should have a precondition block where the generated popcount intrinsic
+ // function can be inserted.
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI || PreCondBI->isUnconditional())
+ return false;
+
+ Instruction *CntInst;
+ PHINode *CntPhi;
+ Value *Val;
+ if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+ return false;
+
+ transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+ return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL) {
+ Value *Ops[] = {Val};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+ // If the CTLZ counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
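
A small self-contained check of the closed form installed by the transform above, Count = BitWidth(InitX) - CTLZ(InitX), against the trip count of the do/while loop from the comment (hypothetical harness; __builtin_clz stands in for the ctlz intrinsic and is only defined for non-zero arguments, which corresponds to the ZeroCheck case):

#include <cassert>
#include <limits>

// Trip count of "do { x >>= 1; } while (x != 0)" for a given InitX.
static unsigned loopTripCount(unsigned InitX) {
  unsigned Count = 0;
  do {
    ++Count;
    InitX >>= 1;
  } while (InitX != 0);
  return Count;
}

int main() {
  const unsigned BitWidth = std::numeric_limits<unsigned>::digits;
  // The preheader value the transform computes: BitWidth - CTLZ(InitX).
  for (unsigned InitX = 1; InitX < 1024; ++InitX)
    assert(loopTripCount(InitX) ==
           BitWidth - static_cast<unsigned>(__builtin_clz(InitX)));
  return 0;
}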
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+ Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc DL = CntInst->getDebugLoc();
+
+ // Assume that before the transformation the loop looks like the following:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilder<> Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+ // If the population counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+ // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
+ // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+ // function would be partially dead code, and downstream passes would drag
+ // it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond = cast<ICmpInst>(
+ Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCondBr->setCondition(NewPreCond);
+
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+ // loop in question, which enables us to convert the loop from a noncountable
+ // loop into a countable one. The benefit is twofold:
+ //
+ // - If the loop only counts population, the entire loop becomes dead after
+ // the transformation. It is a lot easier to prove a countable loop dead
+ // than to prove a noncountable one. (In some C dialects, an infinite loop
+ // isn't dead even if it computes nothing useful. In general, DCE needs
+ // to prove a noncountable loop finite before it can safely delete it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+ // After this step, this loop (conceptually) would look like the following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+ // do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
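
The conceptual after-form from the Step 2 and Step 3 comments above, written out as a compilable function (hypothetical names; __builtin_popcount stands in for the ctpop call inserted in the precondition block):

// The rewritten, countable loop: the new induction variable t makes the
// trip count exactly the population count of x.
unsigned popcount_countable(unsigned x) {
  unsigned newcnt = static_cast<unsigned>(__builtin_popcount(x)); // NewCount
  unsigned t = newcnt;                                            // TripCnt
  unsigned cnt = 0;
  if (newcnt != 0) {     // rewritten precondition: "if (NewCount == 0) exit"
    do {
      ++cnt;             // the original counter, now equal to the trip count
      x &= x - 1;
      --t;               // TcDec, the new countable induction variable
    } while (t > 0);
  }
  return newcnt;         // uses outside the loop are redirected to NewCount
}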
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
new file mode 100644
index 000000000000..af095560cc02
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -0,0 +1,208 @@
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ AssumptionCache *AC,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+ // The bit we are stealing from the pointer represents whether this basic
+ // block is the header of a subloop, in which case we only process its phis.
+ typedef PointerIntPair<BasicBlock *, 1> WorklistItem;
+ SmallVector<WorklistItem, 16> VisitStack;
+ SmallPtrSet<BasicBlock *, 32> Visited;
+
+ bool Changed = false;
+ bool LocalChanged;
+ do {
+ LocalChanged = false;
+
+ VisitStack.clear();
+ Visited.clear();
+
+ VisitStack.push_back(WorklistItem(L->getHeader(), false));
+
+ while (!VisitStack.empty()) {
+ WorklistItem Item = VisitStack.pop_back_val();
+ BasicBlock *BB = Item.getPointer();
+ bool IsSubloopHeader = Item.getInt();
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Simplify instructions in the current basic block.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+
+ // The first time through the loop ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(I))
+ continue;
+
+ // Don't bother simplifying unused instructions.
+ if (!I->use_empty()) {
+ Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
+ if (V && LI->replacementPreservesLCSSAForm(I, V)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I->users())
+ Next->insert(cast<Instruction>(U));
+
+ I->replaceAllUsesWith(V);
+ LocalChanged = true;
+ ++NumSimplified;
+ }
+ }
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
+ // RecursivelyDeleteTriviallyDeadInstructions can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted, re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ LocalChanged = true;
+ }
+
+ if (IsSubloopHeader && !isa<PHINode>(I))
+ break;
+ }
+
+ // Add all successors to the worklist, except for loop exit blocks and the
+ // bodies of subloops. We visit the headers of loops so that we can process
+ // their phis, but we contract the rest of the subloop body and only follow
+ // edges leading back to the original loop.
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+ ++SI) {
+ BasicBlock *SuccBB = *SI;
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ const Loop *SuccLoop = LI->getLoopFor(SuccBB);
+ if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
+ L->contains(SuccLoop)) {
+ VisitStack.push_back(WorklistItem(SuccBB, true));
+
+ SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
+ SuccLoop->getExitBlocks(SubLoopExitBlocks);
+
+ for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
+ BasicBlock *ExitBB = SubLoopExitBlocks[i];
+ if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
+ VisitStack.push_back(WorklistItem(ExitBB, false));
+ }
+
+ continue;
+ }
+
+ bool IsExitBlock =
+ std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
+ if (IsExitBlock)
+ continue;
+
+ VisitStack.push_back(WorklistItem(SuccBB, false));
+ }
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+
+ Changed |= LocalChanged;
+ } while (LocalChanged);
+
+ return Changed;
+}
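
The pass works on LLVM IR, but at the source level the per-instruction redundancies SimplifyLoopInst folds away (without touching control flow) look roughly like the hypothetical loop below, where the add of zero and the multiply by one simplify to their operands:

// Redundant arithmetic inside a loop body that instruction simplification
// removes; only individual instructions change, not the loop structure.
int sum_with_redundancy(const int *a, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    int idx = i + 0;     // simplifies to i
    sum += a[idx] * 1;   // the multiply by one folds to a[idx]
  }
  return sum;
}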
+
+namespace {
+class LoopInstSimplifyLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopInstSimplifyLegacyPass() : LoopPass(ID) {
+ initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return SimplifyLoopInst(L, DT, LI, AC, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+char LoopInstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+ return new LoopInstSimplifyLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 000000000000..606136dc31a4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1298 @@
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles the loop interchange transform.
+// It interchanges loops to provide more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+static cl::opt<int> LoopInterchangeCostThreshold(
+ "loop-interchange-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Interchange if you gain more than this number"));
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICIES
+void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
+ std::vector<char> Vec = *I;
+ for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
+ DEBUG(dbgs() << *II << " ");
+ DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+ Loop *L, DependenceInfo *DI) {
+ typedef SmallVector<Value *, 16> ValueVector;
+ ValueVector MemInstr;
+
+ // For each block.
+ for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+ BB != BE; ++BB) {
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+ ++I) {
+ if (!isa<Instruction>(I))
+ return false;
+ if (LoadInst *Ld = dyn_cast<LoadInst>(I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&*I);
+ } else if (StoreInst *St = dyn_cast<StoreInst>(I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&*I);
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = cast<Instruction>(*I);
+ Instruction *Dst = cast<Instruction>(*J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+ // Track Output, Flow, and Anti dependencies.
+ if (auto D = DI->depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+ DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ // If we don't have a DepMatrix to check legality against, return false.
+ if (DepMatrix.size() == 0)
+ return false;
+ return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if the outermost non-'=', non-'S', non-'I' dependence in the
+// dependence matrix is '>'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+ // All dependencies were '=','S' or 'I'
+ return false;
+}
+
+// Checks that no dependence exists in the dependency matrix in Row before Column.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+ if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+ DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep,
+ char OuterDep) {
+
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+ // If OuterLoopId represents the outermost loop, then interchanging will make
+ // the first dependency '>'.
+ if (OuterLoopId == 0)
+ return false;
+
+ // If all dependencies before OuterLoopId are '=', 'S' or 'I', then
+ // interchanging will result in this row having an outermost non-'='
+ // dependency of '>'.
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+ unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
+ return false;
+ }
+ return true;
+}
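
The theorem cited above can be exercised on a toy direction matrix; the helper below (hypothetical names, made-up matrix entries) swaps two columns and checks that no row ends up with '>' as its leftmost non-'='/'S'/'I' direction, rejecting '*' conservatively just as isLegalToInterChangeLoops does:

#include <cassert>
#include <utility>
#include <vector>

using DirMatrix = std::vector<std::vector<char>>;

// Returns true if the interchange of columns A and B leaves no row with '>'
// as its leftmost non-'='/'S'/'I' direction ('*' is treated as unsafe).
static bool legalAfterSwap(DirMatrix M, unsigned A, unsigned B) {
  for (auto &Row : M) {
    std::swap(Row[A], Row[B]);
    for (char D : Row) {
      if (D == '=' || D == 'S' || D == 'I')
        continue;
      if (D == '>' || D == '*')
        return false;    // a leading '>' (or unknown) makes the swap illegal
      break;             // leftmost non-'=' direction is '<': this row is fine
    }
  }
  return true;
}

int main() {
  // One dependence with (outer, inner) directions ('<', '>'), e.g. the classic
  // A[i+1][j-1] = ... A[i][j] ... pattern: interchange is illegal.
  assert(!legalAfterSwap({{'<', '>'}}, 0, 1));
  // Directions ('<', '<') remain legal after interchange.
  assert(legalAfterSwap({{'<', '<'}}, 0, 1));
  return 0;
}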
+
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+ DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
+ // The current loop has multiple subloops, hence it is not tightly
+ // nested.
+ // Discard all loops above it that were added into the worklist.
+ if (Vec->size() != 1) {
+ LoopList.clear();
+ return;
+ }
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
+ }
+ LoopList.push_back(CurrentLoop);
+ V.push_back(std::move(LoopList));
+}
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isa<SCEVConstant>(Step))
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+ bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+ bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
+ bool areAllUsesReductions(Instruction *Ins, Loop *L);
+ bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
+ bool findInductionAndReductions(Loop *L,
+ SmallVector<PHINode *, 8> &Inductions,
+ SmallVector<PHINode *, 8> &Reductions);
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ bool InnerLoopHasReduction;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+
+ /// Check if the loop interchange is profitable.
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ BasicBlock *LoopNestExit,
+ bool InnerLoopContainsReductions)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit),
+ InnerLoopHasReduction(InnerLoopContainsReductions) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ void splitInnerLoopLatch(Instruction *);
+ void splitInnerLoopHeader();
+ bool adjustLoopLinks();
+ void adjustLoopPreheaders();
+ bool adjustLoopBranches();
+ void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
+ BasicBlock *NewPred);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+ bool InnerLoopHasReduction;
+};
+
+// Main LoopInterchange Pass.
+struct LoopInterchange : public FunctionPass {
+ static char ID;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DependenceInfo *DI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+ LoopInterchange()
+ : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
+ initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ // Build up a worklist of loop pairs to analyze.
+ SmallVector<LoopVector, 8> Worklist;
+
+ for (Loop *L : *LI)
+ populateWorklist(*L, Worklist);
+
+ DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ bool Changed = true;
+ while (!Worklist.empty()) {
+ LoopVector LoopList = Worklist.pop_back_val();
+ Changed = processLoopList(LoopList, F);
+ }
+ return Changed;
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (Loop *L : LoopList) {
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ if (ExitCountOuter == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
+
+ unsigned selectLoopForInterchange(const LoopVector &LoopList) {
+ // TODO: Add a better heuristic to select the loop to be interchanged based
+ // on the dependence matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList, Function &F) {
+
+ bool Changed = false;
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
+ DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (LoopNestDepth > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
+
+ CharMatrix DependencyMatrix;
+ Loop *OuterMostLoop = *(LoopList.begin());
+ if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+ OuterMostLoop, DI)) {
+ DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence before interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+ BranchInst *OuterMostLoopLatchBI =
+ dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+ if (!OuterMostLoopLatchBI)
+ return false;
+
+ // Since we currently do not handle LCSSA PHIs, any failure in the loop
+ // condition will now branch to LoopNestExit.
+ // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit;
+ if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+ else
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+ if (isa<PHINode>(LoopNestExit->begin())) {
+ DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
+ "since on failure all loops branch to loop nest exit.\n");
+ return false;
+ }
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ // Move the selected loop outwards to the best possible position.
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+ // The loops were interchanged; reflect that in LoopList.
+ std::swap(LoopList[i - 1], LoopList[i]);
+
+ // Update the DependencyMatrix
+ interChangeDependencies(DependencyMatrix, i, i - 1);
+ DT->recalculate(F);
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence after interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+
+ DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
+ PreserveLCSSA);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Interchanging loops not profitable\n");
+ return false;
+ }
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
+ LoopNestExit, LIL.hasInnerLoopReduction());
+ LIT.transform();
+ DEBUG(dbgs() << "Loops interchanged\n");
+ return true;
+ }
+};
+
+} // end of namespace
+bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
+ return none_of(Ins->users(), [=](User *U) -> bool {
+ auto *UserIns = dyn_cast<PHINode>(U);
+ RecurrenceDescriptor RD;
+ return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
+ });
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
+ BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Loads corresponding to reduction PHIs are safe when deciding whether the
+ // loops are tightly nested.
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ if (!areAllUsesReductions(L, InnerLoop))
+ return true;
+ } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
+ BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Stores corresponding to reductions are safe when deciding whether the
+ // loops are tightly nested.
+ if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+ if (!isa<PHINode>(L->getOperand(0)))
+ return true;
+ } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+
+ // A perfectly nested loop will not have any branch in between the outer and
+ // inner blocks, i.e. the outer header will branch only to either the inner
+ // preheader or the outer loop latch.
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!OuterLoopHeaderBI)
+ return false;
+
+ for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+ OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+ return false;
+ }
+
+ DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ // We do not have any basic block in between; now make sure the outer header
+ // and the outer loop latch don't contain any unsafe instructions.
+ if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
+ containsUnsafeInstructionsInLatch(OuterLoopLatch))
+ return false;
+
+ DEBUG(dbgs() << "Loops are perfectly nested\n");
+ // We have a perfect loop nest.
+ return true;
+}
+
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+ Loop *L, SmallVector<PHINode *, 8> &Inductions,
+ SmallVector<PHINode *, 8> &Reductions) {
+ if (!L->getLoopLatch() || !L->getLoopPredecessor())
+ return false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ RecurrenceDescriptor RD;
+ InductionDescriptor ID;
+ PHINode *PHI = cast<PHINode>(I);
+ if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
+ Inductions.push_back(PHI);
+ else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+ Reductions.push_back(PHI);
+ else {
+ DEBUG(
+ dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
+ for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+    // A reduction LCSSA PHI has only one incoming block, which is the loop
+    // latch.
+ if (PHI->getNumIncomingValues() > 1)
+ return false;
+ Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+ if (!Ins)
+ return false;
+    // The incoming value of an LCSSA PHI in the outer loop exit can only be an
+    // LCSSA PHI from the inner loop exit, otherwise the loops are not tightly
+    // nested.
+ if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
+ return false;
+ }
+ return true;
+}
+
+static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
+ BasicBlock *LoopHeader) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
+ unsigned Num = BI->getNumSuccessors();
+ assert(Num == 2);
+ for (unsigned i = 0; i < Num; ++i) {
+ if (BI->getSuccessor(i) == LoopHeader)
+ continue;
+ return BI->getSuccessor(i);
+ }
+ }
+ return nullptr;
+}
+
+// This function returns true if the transform hits one of its current
+// limitations, in which case we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+ PHINode *InnerInductionVar;
+ SmallVector<PHINode *, 8> Inductions;
+ SmallVector<PHINode *, 8> Reductions;
+ if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+    DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
+                 << " Failed to interchange due to current limitation\n");
+ return true;
+ }
+ if (Reductions.size() > 0)
+ InnerLoopHasReduction = true;
+
+ InnerInductionVar = Inductions.pop_back_val();
+ Reductions.clear();
+ if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ return true;
+ }
+
+  // The outer loop cannot have reductions because then the loops would not be
+  // tightly nested.
+ if (!Reductions.empty()) {
+ DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
+ return true;
+ }
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ return true;
+ }
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ return true;
+ }
+
+ // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
+ BasicBlock *LoopExitBlock =
+ getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+ return true;
+ }
+
+ LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ return true;
+ }
+
+  // TODO: Current limitation: since we split the inner loop latch at the point
+  // where the induction variable is incremented (induction.next), we cannot
+  // have more than one user of induction.next, since that would result in
+  // broken code after the split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc) {
+ DEBUG(dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ return true;
+ }
+
+  // Since we split the inner loop latch on this induction variable, make sure
+  // there is no instruction between the induction variable increment and the
+  // branch instruction.
+
+ bool FoundInduction = false;
+ for (const Instruction &I : reverse(*InnerLoopLatch)) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
+ continue;
+
+    // We found an instruction. If it is not the induction variable increment,
+    // it is not safe to split this loop latch.
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ return true;
+ }
+
+ FoundInduction = true;
+ break;
+ }
+  // We reached the end of the loop latch without finding the induction
+  // variable increment; treat this as a current limitation.
+ if (!FoundInduction) {
+ DEBUG(dbgs() << "Did not find the induction variable.\n");
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
+ return false;
+ }
+
+  // Create unique preheaders if we do not already have them.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+  // Create a unique outer preheader:
+  // 1) If the OuterLoop preheader is not present.
+  // 2) If the OuterLoop preheader is the same as the OuterLoop header.
+  // 3) If the OuterLoop preheader is the same as the header of the previous
+  //    loop.
+  // 4) If the OuterLoop preheader is the entry node.
+ if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+ isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor()) {
+ OuterLoopPreHeader =
+ InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
+ }
+
+ if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+ InnerLoopPreHeader == OuterLoop->getHeader()) {
+ InnerLoopPreHeader =
+ InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
+ }
+
+  // TODO: The loops may not be interchangeable due to current limitations in
+  // the transform module; see currentLimitations().
+ if (currentLimitations()) {
+ DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ DEBUG(dbgs() << "Loops not tightly nested\n");
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder, BadOrder;
+ BadOrder = GoodOrder = 0;
+ for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+ BI != BE; ++BI) {
+ for (Instruction &Ins : **BI) {
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop-independent, i.e. it does not carry any
+  // dependence, it is profitable to move it to the outer position.
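+  // Illustrative reading of the check below: a dependence row whose entry for
+  // the inner loop is 'S' or 'I' and whose entry for the outer loop is '='
+  // keeps the nest a candidate; any other combination rejects it.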
+ unsigned Row = DepMatrix.size();
+ for (unsigned i = 0; i < Row; ++i) {
+ if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (DepMatrix[i][OuterLoopId] != '=')
+ return false;
+ }
+  // If the outer loop carries a dependence and the inner loop is
+  // loop-independent, then it is profitable to interchange the loops to enable
+  // parallelism.
+ return true;
+}
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ // TODO: Add better profitability checks.
+ // e.g
+ // 1) Construct dependency matrix and move the one with no loop carried dep
+ // inside to enable vectorization.
+
+  // This is a rough cost-estimation algorithm. It counts the good and bad
+  // orderings of the induction variables in the instructions and allows
+  // interchange if the number of bad orderings exceeds the number of good
+  // ones.
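+  // For example (illustrative numbers only): if 2 GEPs access memory in the
+  // good order and 6 in the bad order, Cost = 2 - 6 = -4, and the interchange
+  // is considered profitable when -4 < -LoopInterchangeCostThreshold.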
+ int Cost = getInstrOrderCost();
+ DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < -LoopInterchangeCostThreshold)
+ return true;
+
+ // It is not profitable as per current cache profitability model. But check if
+ // we can move this loop outside to improve parallelism.
+ bool ImprovesPar =
+ isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ return ImprovesPar;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
+ ++I) {
+ if (*I == InnerLoop) {
+ OuterLoop->removeChildLoop(I);
+ return;
+ }
+ }
+ llvm_unreachable("Couldn't find loop");
+}
+
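+// A sketch of the loop-nest update performed below, assuming a simple
+// three-level nest for illustration:
+//
+//   before:  Parent { Outer { Inner { Grandchildren... } } }
+//   after:   Parent { Inner { Outer { Grandchildren... } } }
+//
+// Inner takes Outer's position in the nest (or becomes a top-level loop),
+// Outer becomes Inner's child, and Inner's former children are re-parented to
+// Outer.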
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+ Loop *OuterLoop) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, OuterLoop);
+ removeChildLoop(OuterLoop, InnerLoop);
+ OuterLoopParent->addChildLoop(InnerLoop);
+ } else {
+ removeChildLoop(OuterLoop, InnerLoop);
+ LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ }
+
+ while (!InnerLoop->empty())
+ OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
+
+ InnerLoop->addChildLoop(OuterLoop);
+}
+
+bool LoopInterchangeTransform::transform() {
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().size() == 0) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ //
+    // Split at the place where the induction variable is
+    // incremented/decremented.
+ // TODO: This splitting logic may not work always. Fix this.
+ splitInnerLoopLatch(InnerIndexVar);
+ DEBUG(dbgs() << "splitInnerLoopLatch done\n");
+
+    // Split the inner loop's PHI nodes out into a separate basic block.
+ splitInnerLoopHeader();
+ DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+ }
+
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ return false;
+ }
+
+ restructureLoops(InnerLoop, OuterLoop);
+ return true;
+}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+ InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+  // Split the inner loop header out, making sure that the reduction PHIs stay
+  // in the inner loop body.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ if (InnerLoopHasReduction) {
+ // FIXME: Check if the induction PHI will always be the first PHI.
+ BasicBlock *New = InnerLoopHeader->splitBasicBlock(
+ ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
+ if (LI)
+ if (Loop *L = LI->getLoopFor(InnerLoopHeader))
+ L->addBasicBlockToLoop(New, *LI);
+
+ // Adjust Reduction PHI's in the block.
+ SmallVector<PHINode *, 8> PHIVec;
+ for (auto I = New->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = dyn_cast<PHINode>(I);
+ Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
+ PHI->replaceAllUsesWith(V);
+ PHIVec.push_back((PHI));
+ }
+ for (PHINode *P : PHIVec) {
+ P->eraseFromParent();
+ }
+ } else {
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ }
+
+ DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+ "InnerLoopHeader\n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+ auto &ToList = InsertBefore->getParent()->getInstList();
+ auto &FromList = FromBB->getInstList();
+
+ ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+ FromBB->getTerminator()->getIterator());
+}
+
+void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
+ BasicBlock *OldPred,
+ BasicBlock *NewPred) {
+ for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ unsigned Num = PHI->getNumIncomingValues();
+ for (unsigned i = 0; i < Num; ++i) {
+ if (PHI->getIncomingBlock(i) == OldPred)
+ PHI->setIncomingBlock(i, NewPred);
+ }
+ }
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+ // Adjust Loop Preheader and headers
+
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+ }
+
+ // Adjust reduction PHI's now that the incoming block has changed.
+ updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
+ OuterLoopHeader);
+
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
+ // the value and remove this PHI node from inner loop.
+ SmallVector<PHINode *, 8> LcssaVec;
+ for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
+ PHINode *LcssaPhi = cast<PHINode>(I);
+ LcssaVec.push_back(LcssaPhi);
+ }
+ for (PHINode *P : LcssaVec) {
+ Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
+ P->replaceAllUsesWith(Incoming);
+ P->eraseFromParent();
+ }
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+  // We have interchanged the preheaders, so we need to interchange their
+  // contents as well, because the contents of the inner preheader were
+  // previously executed inside the outer loop.
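+  // As a hypothetical example, an address computation such as "p = &A[i];"
+  // placed in the inner preheader used to execute once per iteration of the
+  // old outer loop; the two moveBBContents calls below relocate the preheader
+  // contents so that this still holds after the interchange.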
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+  // These instructions should now be executed inside the loop.
+  // Move them into the outer loop header, right before its terminator.
+ moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
+  // These instructions were previously not executed inside the loop, so move
+  // them to the old inner loop preheader.
+ moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed)
+ adjustLoopPreheaders();
+ return Changed;
+}
+
+char LoopInterchange::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 000000000000..20b37c4b70e6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,666 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the
+// users of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
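+// As a source-level illustration (simplified), a loop such as
+//
+//   for (unsigned i = 0; i < n; ++i)
+//     A[i + 1] = A[i] + B[i];
+//
+// reads in A[i] the value that was stored to A[i + 1] on the previous
+// iteration, so the load can be replaced by a value carried in a PHI across
+// the backedge (see the IR sketch in processLoop below).
+//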
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <algorithm>
+#include <cassert>
+#include <forward_list>
+#include <set>
+#include <tuple>
+#include <utility>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+ "runtime-check-per-loop-load-elim", cl::Hidden,
+ cl::desc("Max number of memchecks allowed per eliminated load on average"),
+ cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+ "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Load Elimination"));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+ LoadInst *Load;
+ StoreInst *Store;
+
+ StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+ : Load(Load), Store(Store) {}
+
+ /// \brief Return true if the dependence from the store to the load has a
+ /// distance of one. E.g. A[i+1] = A[i]
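+  ///
+  /// As a worked illustration (assuming 32-bit elements): for A[i+1] = A[i]
+  /// the store and load pointer SCEVs are {A+4,+,4} and {A,+,4}, so their
+  /// difference is the constant 4, which equals the type's allocation size.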
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
+ Loop *L) const {
+ Value *LoadPtr = Load->getPointerOperand();
+ Value *StorePtr = Store->getPointerOperand();
+ Type *LoadPtrType = LoadPtr->getType();
+ Type *LoadType = LoadPtrType->getPointerElementType();
+
+ assert(LoadPtrType->getPointerAddressSpace() ==
+ StorePtr->getType()->getPointerAddressSpace() &&
+ LoadType == StorePtr->getType()->getPointerElementType() &&
+ "Should be a known dependence");
+
+ // Currently we only support accesses with unit stride. FIXME: we should be
+    // able to handle non-unit strides as well, as long as the stride is equal
+    // to the dependence distance.
+ if (getPtrStride(PSE, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, StorePtr, L) != 1)
+ return false;
+
+ auto &DL = Load->getParent()->getModule()->getDataLayout();
+ unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+ auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+ auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+ // We don't need to check non-wrapping here because forward/backward
+ // dependence wouldn't be valid if these weren't monotonic accesses.
+ auto *Dist = cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+ const APInt &Val = Dist->getAPInt();
+ return Val == TypeByteSize;
+ }
+
+ Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const StoreToLoadForwardingCandidate &Cand) {
+ OS << *Cand.Store << " -->\n";
+ OS.indent(2) << *Cand.Load << "\n";
+ return OS;
+ }
+#endif
+};
+
+/// \brief Check that the store dominates all latches, so that, as long as
+/// there is no intervening store, this value will be loaded in the next
+/// iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+ DominatorTree *DT) {
+ SmallVector<BasicBlock *, 8> Latches;
+ L->getLoopLatches(Latches);
+ return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
+ return DT->dominates(StoreBlock, Latch);
+ });
+}
+
+/// \brief Return true if the load is not executed on all paths in the loop.
+static bool isLoadConditional(LoadInst *Load, Loop *L) {
+ return Load->getParent() != L->getHeader();
+}
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+ LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+ DominatorTree *DT)
+ : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
+
+ /// \brief Look through the loop-carried and loop-independent dependences in
+ /// this loop and find store->load dependences.
+ ///
+ /// Note that no candidate is returned if LAA has failed to analyze the loop
+ /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+ std::forward_list<StoreToLoadForwardingCandidate>
+ findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+ std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+ const auto *Deps = LAI.getDepChecker().getDependences();
+ if (!Deps)
+ return Candidates;
+
+    // Find store->load dependences (i.e. true dependences). Both lexically
+    // forward and backward dependences qualify. Disqualify loads that have
+    // other unknown dependences.
+
+ SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+ for (const auto &Dep : *Deps) {
+ Instruction *Source = Dep.getSource(LAI);
+ Instruction *Destination = Dep.getDestination(LAI);
+
+ if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+ if (isa<LoadInst>(Source))
+ LoadsWithUnknownDepedence.insert(Source);
+ if (isa<LoadInst>(Destination))
+ LoadsWithUnknownDepedence.insert(Destination);
+ continue;
+ }
+
+ if (Dep.isBackward())
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ std::swap(Source, Destination);
+ else
+ assert(Dep.isForward() && "Needs to be a forward dependence");
+
+ auto *Store = dyn_cast<StoreInst>(Source);
+ if (!Store)
+ continue;
+ auto *Load = dyn_cast<LoadInst>(Destination);
+ if (!Load)
+ continue;
+
+      // Only propagate the value if the store and the load are of the same
+      // type.
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
+ continue;
+
+ Candidates.emplace_front(Load, Store);
+ }
+
+ if (!LoadsWithUnknownDepedence.empty())
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+ return LoadsWithUnknownDepedence.count(C.Load);
+ });
+
+ return Candidates;
+ }
+
+ /// \brief Return the index of the instruction according to program order.
+ unsigned getInstrIndex(Instruction *Inst) {
+ auto I = InstOrder.find(Inst);
+ assert(I != InstOrder.end() && "No index for instruction");
+ return I->second;
+ }
+
+ /// \brief If a load has multiple candidates associated (i.e. different
+ /// stores), it means that it could be forwarding from multiple stores
+ /// depending on control flow. Remove these candidates.
+ ///
+ /// Here, we rely on LAA to include the relevant loop-independent dependences.
+  /// LAA is known to omit these in the very simple case when the read and the
+  /// write within an alias set always take place using the *same* pointer.
+  ///
+  /// However, we know that this is not the case here, i.e. we can rely on LAA
+  /// to provide us with loop-independent dependences for the cases we're
+  /// interested in. Consider, for example, the case where a loop-independent
+  /// dependence S1->S2 invalidates the forwarding S3->S2.
+ ///
+ /// A[i] = ... (S1)
+ /// ... = A[i] (S2)
+ /// A[i+1] = ... (S3)
+ ///
+ /// LAA will perform dependence analysis here because there are two
+ /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+ void removeDependencesFromMultipleStores(
+ std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+    // If the mapped candidate is nullptr, it means that multiple stores
+    // forward to this load.
+ typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+ LoadToSingleCandT;
+ LoadToSingleCandT LoadToSingleCand;
+
+ for (const auto &Cand : Candidates) {
+ bool NewElt;
+ LoadToSingleCandT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+ if (!NewElt) {
+ const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+ // Already multiple stores forward to this load.
+ if (OtherCand == nullptr)
+ continue;
+
+ // Handle the very basic case when the two stores are in the same block
+ // so deciding which one forwards is easy. The later one forwards as
+ // long as they both have a dependence distance of one to the load.
+ if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+ Cand.isDependenceDistanceOfOne(PSE, L) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ // They are in the same block, the later one will forward to the load.
+ if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+ OtherCand = &Cand;
+ } else
+ OtherCand = nullptr;
+ }
+ }
+
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+ if (LoadToSingleCand[Cand.Load] != &Cand) {
+ DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
+ return true;
+ }
+ return false;
+ });
+ }
+
+  /// \brief Given two pointer operations identified by their
+  /// RuntimePointerChecking indices, return true if they require an alias
+  /// check.
+ ///
+ /// We need a check if one is a pointer for a candidate load and the other is
+ /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+ const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const std::set<Value *> &CandLoadPtrs) {
+ Value *Ptr1 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+ Value *Ptr2 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+ return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+ }
+
+ /// \brief Return pointers that are possibly written to on the path from a
+ /// forwarding store to a load.
+ ///
+ /// These pointers need to be alias-checked against the forwarding candidates.
+ SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+ // From FirstStore to LastLoad neither of the elimination candidate loads
+ // should overlap with any of the stores.
+ //
+ // E.g.:
+ //
+ // st1 C[i]
+ // ld1 B[i] <-------,
+ // ld0 A[i] <----, | * LastLoad
+ // ... | |
+ // st2 E[i] | |
+ // st3 B[i+1] -- | -' * FirstStore
+ // st0 A[i+1] ---'
+ // st4 D[i]
+ //
+ // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+ // ld0.
+
+ LoadInst *LastLoad =
+ std::max_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+ })
+ ->Load;
+ StoreInst *FirstStore =
+ std::min_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Store) <
+ getInstrIndex(B.Store);
+ })
+ ->Store;
+
+ // We're looking for stores after the first forwarding store until the end
+ // of the loop, then from the beginning of the loop until the last
+    // forwarded-to load. Collect the pointers of these stores.
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+ auto InsertStorePtr = [&](Instruction *I) {
+ if (auto *S = dyn_cast<StoreInst>(I))
+ PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+ };
+ const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+ std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+ MemInstrs.end(), InsertStorePtr);
+ std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+ InsertStorePtr);
+
+ return PtrsWrittenOnFwdingPath;
+ }
+
+ /// \brief Determine the pointer alias checks to prove that there are no
+ /// intervening stores.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ findPointersWrittenOnForwardingPath(Candidates);
+
+ // Collect the pointers of the candidate loads.
+ // FIXME: SmallSet does not work with std::inserter.
+ std::set<Value *> CandLoadPtrs;
+ transform(Candidates,
+ std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+ std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+ const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
+
+ DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+ DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ return Checks;
+ }
+
+ /// \brief Perform the transformation for a candidate.
+ void
+ propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+ SCEVExpander &SEE) {
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ Value *Ptr = Cand.Load->getPointerOperand();
+ auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+ auto *PH = L->getLoopPreheader();
+ Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+ PH->getTerminator());
+ Value *Initial =
+ new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false,
+ Cand.Load->getAlignment(), PH->getTerminator());
+
+ PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+ &L->getHeader()->front());
+ PHI->addIncoming(Initial, PH);
+ PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+ Cand.Load->replaceAllUsesWith(PHI);
+ }
+
+ /// \brief Top-level driver for each loop: find store->load forwarding
+ /// candidates, add run-time checks and perform transformation.
+ bool processLoop() {
+ DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+ // Look for store-to-load forwarding cases across the
+ // backedge. E.g.:
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ // First start with store->load dependences.
+ auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Generate an index for each load and store according to the original
+ // program order. This will be used later.
+ InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+ // To keep things simple for now, remove those where the load is potentially
+ // fed by multiple stores.
+ removeDependencesFromMultipleStores(StoreToLoadDependences);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Filter the candidates further.
+ SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+ unsigned NumForwarding = 0;
+ for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
+ DEBUG(dbgs() << "Candidate " << Cand);
+
+      // Make sure that the stored value is available everywhere in the loop in
+      // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+ continue;
+
+ // If the load is conditional we can't hoist its 0-iteration instance to
+ // the preheader because that would make it unconditional. Thus we would
+ // access a memory location that the original loop did not access.
+ if (isLoadConditional(Cand.Load, L))
+ continue;
+
+ // Check whether the SCEV difference is the same as the induction step,
+ // thus we load the value in the next iteration.
+ if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ continue;
+
+ ++NumForwarding;
+ DEBUG(dbgs()
+ << NumForwarding
+ << ". Valid store-to-load forwarding across the loop backedge\n");
+ Candidates.push_back(Cand);
+ }
+ if (Candidates.empty())
+ return false;
+
+ // Check intervening may-alias stores. These need runtime checks for alias
+ // disambiguation.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+ collectMemchecks(Candidates);
+
+ // Too many checks are likely to outweigh the benefits of forwarding.
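+    // For instance, with the default of one memcheck per eliminated load
+    // (-runtime-check-per-loop-load-elim=1), three candidates tolerate at most
+    // three runtime checks.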
+ if (Checks.size() > Candidates.size() * CheckPerElim) {
+ DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ return false;
+ }
+
+ if (LAI.getPSE().getUnionPredicate().getComplexity() >
+ LoadElimSCEVCheckThreshold) {
+ DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ return false;
+ }
+
+ if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (L->getHeader()->getParent()->optForSize()) {
+ DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
+ return false;
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+        DEBUG(dbgs() << "Loop is not in loop-simplify form\n");
+ return false;
+ }
+
+ // Point of no-return, start the transformation. First, version the loop
+ // if necessary.
+
+ LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
+ LV.setAliasChecks(std::move(Checks));
+ LV.setSCEVChecks(LAI.getPSE().getUnionPredicate());
+ LV.versionLoop();
+ }
+
+ // Next, propagate the value stored by the store to the users of the load.
+ // Also for the first iteration, generate the initial value of the load.
+ SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+ "storeforward");
+ for (const auto &Cand : Candidates)
+ propagateStoredValueToLoadUsers(Cand, SEE);
+ NumLoopLoadEliminted += NumForwarding;
+
+ return true;
+ }
+
+private:
+ Loop *L;
+
+ /// \brief Maps the load/store instructions to their index according to
+ /// program order.
+ DenseMap<Instruction *, unsigned> InstOrder;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo &LAI;
+ DominatorTree *DT;
+ PredicatedScalarEvolution PSE;
+};
+
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
+/// \brief The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+ LoopLoadElimination() : FunctionPass(ID) {
+ initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ // Process each loop nest in the function.
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char LoopLoadElimination::ID;
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+namespace llvm {
+
+FunctionPass *createLoopLoadEliminationPass() {
+ return new LoopLoadElimination();
+}
+
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
new file mode 100644
index 000000000000..10f6fcdcfdb7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -0,0 +1,92 @@
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+// Explicit template instantiations and specialization definitions for core
+// template typedefs.
+namespace llvm {
+template class PassManager<Loop, LoopAnalysisManager,
+ LoopStandardAnalysisResults &, LPMUpdater &>;
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+template <>
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+ PreservedAnalyses PA = PreservedAnalyses::all();
+
+ if (DebugLogging)
+ dbgs() << "Starting Loop pass manager run.\n";
+
+ for (auto &Pass : Passes) {
+ if (DebugLogging)
+ dbgs() << "Running pass: " << Pass->name() << " on " << L;
+
+ PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
+ PA.intersect(std::move(PassPA));
+ break;
+ }
+
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
+ AM.invalidate(L, PassPA);
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
+ PA.intersect(std::move(PassPA));
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
+
+ // Invalidation for the current loop should be handled above, and other loop
+ // analysis results shouldn't be impacted by runs over this loop. Therefore,
+ // the remaining analysis results in the AnalysisManager are preserved. We
+ // mark this with a set so that we don't need to inspect each one
+ // individually.
+ // FIXME: This isn't correct! This loop and all nested loops' analyses should
+ // be preserved, but unrolling should invalidate the parent loop's analyses.
+ PA.preserveSet<AllAnalysesOn<Loop>>();
+
+ if (DebugLogging)
+ dbgs() << "Finished Loop pass manager run.\n";
+
+ return PA;
+}
+}
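+// A minimal usage sketch (illustrative only; assumes LoopRotatePass and
+// createFunctionToLoopPassAdaptor are available from their headers):
+//
+//   LoopPassManager LPM;
+//   LPM.addPass(LoopRotatePass());
+//   FunctionPassManager FPM;
+//   FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+//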
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+ : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &,
+ LPMUpdater &) {
+ printLoop(L, OS, Banner);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 000000000000..9b12ba180444
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,330 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+namespace {
+class LoopPredication {
+ /// Represents an induction variable check:
+ /// icmp Pred, <induction variable>, <loop invariant limit>
+ struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ };
+
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+
+ Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
+ Instruction *InsertAt);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+public:
+ LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
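+// As an illustration (hypothetical guard, simplified): for a loop
+// "for (i = 0; i < n; i++)" containing "guard(i < len)" with a loop-invariant
+// len, the range check parses to Pred = the icmp predicate (e.g. ult/slt),
+// IV = {0,+,1}<loop> and Limit = len.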
+Optional<LoopPredication::LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+  // Canonicalize so that RHS is the loop-invariant bound and LHS is a
+  // loop-computable IV.
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS, Instruction *InsertAt) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+/// If ICI can be widened to a loop-invariant condition, emit the widened
+/// condition into the loop preheader and return it; otherwise return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = RangeCheck->Pred;
+ const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
+ const SCEV *RHSS = RangeCheck->Limit;
+
+ auto CanExpand = [this](const SCEV *S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+ };
+ if (!CanExpand(RHSS))
+ return None;
+
+ DEBUG(dbgs() << "IndexAR: ");
+ DEBUG(IndexAR->dump());
+
+ bool IsIncreasing = false;
+ if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ return None;
+
+  // If the predicate is increasing, the condition can change from false to
+  // true as the loop progresses; in this case take the value on the first
+  // iteration for the widened check. Otherwise the condition can change from
+  // true to false as the loop progresses, so take the value on the last
+  // iteration.
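+  // For the example in the file header, guard(i < len) with i = 0 .. n-1: the
+  // predicate becomes false as i grows, so we take i's value on the last
+  // iteration and the widened check is "n - 1 < len".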
+ const SCEV *NewLHSS = IsIncreasing
+ ? IndexAR->getStart()
+ : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
+ if (NewLHSS == IndexAR) {
+ DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ return None;
+ }
+
+ DEBUG(dbgs() << "NewLHSS: ");
+ DEBUG(NewLHSS->dump());
+
+ if (!CanExpand(NewLHSS))
+ return None;
+
+ DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+
+ Instruction *InsertAt = Preheader->getTerminator();
+ return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
+}
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+  // The guard condition is expected to be of the form:
+  //   cond1 && cond2 && cond3 ...
+  // Iterate over the subconditions looking for icmp conditions which can be
+  // widened across loop iterations. While widening, remember the resulting
+  // list of subconditions in the Checks vector.
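+  // For example (hypothetical condition): in guard(i < len && p != nullptr),
+  // only the first subcondition is an icmp on an induction variable and may be
+  // widened; the second is kept as is.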
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (Worklist.size() != 0);
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ if (Guards.empty())
+ return false;
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 000000000000..fc0216e76a5b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1761 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+ cl::desc("The maximum increment for loop rerolling"));
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+ enum IterationLimits {
+    /// The maximum number of iterations that we'll try to reroll.
+ IL_MaxRerollIterations = 32,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
+ class LoopReroll : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopReroll() : LoopPass(ID) {
+ initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+ typedef SmallSet<Instruction *, 16> SmallInstructionSet;
+
+    // Map between an induction variable and its increment.
+    DenseMap<Instruction *, int64_t> IVToIncMap;
+    // For a loop with multiple induction variables, remember the one used
+    // only to control the loop.
+ Instruction *LoopControlIV;
+
+ // A chain of isomorphic instructions, identified by a single-use PHI
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L)
+ : Valid(false), Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ typedef SmallInstructionVector::iterator iterator;
+ typedef SmallInstructionVector::const_iterator const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+      // Set up to track possible reductions corresponding to the provided
+      // rerolling scale. Only reductions with a number of non-PHI instructions
+      // that is divisible by the scale are considered. Three instruction sets
+      // are filled in:
+      //   - A set of all possible instructions in eligible reductions.
+      //   - A set of all PHIs in eligible reductions.
+      //   - A set of all reduced values (last instructions) in eligible
+      //     reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
+
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ // Base instruction -> i*3
+ // +---+----+
+ // / | \
+ // ST[y1] +1 +2 <-- Roots
+ // | |
+ // ST[y2] ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ struct DAGRootTracker {
+ DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+ ScalarEvolution *SE, AliasAnalysis *AA,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA,
+ DenseMap<Instruction *, int64_t> &IncrMap,
+ Instruction *LoopCtrlIV)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+ PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
+ LoopControlIV(LoopCtrlIV) {}
+
+ /// Stage 1: Find all the DAG roots for the induction variable.
+ bool findRoots();
+ /// Stage 2: Validate if the found roots are valid.
+ bool validate(ReductionTracker &Reductions);
+ /// Stage 3: Assuming validate() returned true, perform the
+ /// replacement.
+ /// @param IterCount The maximum iteration count of L.
+ void replace(const SCEV *IterCount);
+
+ protected:
+ typedef MapVector<Instruction*, BitVector> UsesTy;
+
+ void findRootsRecursive(Instruction *IVU,
+ SmallInstructionSet SubsumedInsts);
+ bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+ bool collectPossibleRoots(Instruction *Base,
+ std::map<int64_t,Instruction*> &Roots);
+ bool validateRootSet(DAGRootSet &DRS);
+
+ bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+ void collectInLoopUserSet(const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Instruction *Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+
+ UsesTy::iterator nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI=nullptr);
+ bool isBaseInst(Instruction *I);
+ bool isRootInst(Instruction *I);
+ bool instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End);
+ void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount);
+ void updateNonLoopCtrlIncr();
+
+ LoopReroll *Parent;
+
+ // Members of Parent, replicated here for brevity.
+ Loop *L;
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ bool PreserveLCSSA;
+
+ // The loop induction variable.
+ Instruction *IV;
+ // Loop step amount.
+ int64_t Inc;
+ // Loop reroll count; if Inc == 1, this records the scaling applied
+ // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+ // If Inc is not 1, Scale = Inc.
+ uint64_t Scale;
+ // The roots themselves.
+ SmallVector<DAGRootSet,16> RootSets;
+ // All increment instructions for IV.
+ SmallInstructionVector LoopIncs;
+ // Map of all instructions in the loop (in order) to the iterations
+ // they are used in (or specially, IL_All for instructions
+ // used in the loop increment mechanism).
+ UsesTy Uses;
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> &IVToIncMap;
+ Instruction *LoopControlIV;
+ };
+
+ // Check if it is a compare-like instruction whose user is a branch
+ bool isCompareUsedByBranch(Instruction *I) {
+ auto *TI = I->getParent()->getTerminator();
+ if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
+ return false;
+ return I->hasOneUse() && TI->getOperand(0) == I;
+ };
+
+ bool isLoopControlIV(Loop *L, Instruction *IV);
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+ ReductionTracker &Reductions);
+ };
+}
+
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+ return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+ for (User *U : I->users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
+static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE,
+ const SCEV *SCEVExpr,
+ Instruction &IV) {
+ const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr);
+
+  // If the StepRecurrence of SCEVExpr is a constant (c1 * c2, where
+  // c2 = sizeof(ptr)), return c1.
+ if (!MulSCEV && IV.getType()->isPointerTy())
+ if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) {
+ const PointerType *PTy = cast<PointerType>(IV.getType());
+ Type *ElTy = PTy->getElementType();
+ const SCEV *SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy);
+ if (IncSCEV->getValue()->getValue().isNegative()) {
+ const SCEV *NewSCEV =
+ SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr);
+ return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV));
+ } else {
+ return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr));
+ }
+ }
+
+ if (!MulSCEV)
+ return nullptr;
+
+  // If the StepRecurrence of SCEVExpr is c * sizeof(x), where c is a constant,
+  // return c.
+ const SCEVConstant *CIncSCEV = nullptr;
+ for (const SCEV *Operand : MulSCEV->operands()) {
+ if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) {
+ CIncSCEV = Constant;
+ } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) {
+ Type *AllocTy;
+ if (!Unknown->isSizeOf(AllocTy))
+ break;
+ } else {
+ return nullptr;
+ }
+ }
+ return CIncSCEV;
+}
+
+// Check if an IV is only used to control the loop. There are two cases:
+// 1. It has only one use, which is the loop increment; the increment is only
+//    used by the comparison and the PHI (possibly with an nsw sext in
+//    between), and the comparison is only used by the branch.
+// 2. It is used by the loop increment and the comparison; the loop increment
+//    is only used by the PHI, and the comparison is used only by the branch.
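+//
+// For illustration only (hypothetical IR, case 1):
+//   %iv = phi i32 [ 0, %preheader ], [ %iv.next, %body ]
+//   %iv.next = add nsw i32 %iv, 1      ; the only user of %iv
+//   %cmp = icmp slt i32 %iv.next, %n   ; %iv.next is used by %cmp and the PHI
+//   br i1 %cmp, label %body, label %exit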
+bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
+ unsigned IVUses = IV->getNumUses();
+ if (IVUses != 2 && IVUses != 1)
+ return false;
+
+ for (auto *User : IV->users()) {
+ int32_t IncOrCmpUses = User->getNumUses();
+ bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
+
+ // User can only have one or two uses.
+ if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Case 1
+ if (IVUses == 1) {
+ // The only user must be the loop increment.
+ // The loop increment must have two uses.
+ if (IsCompInst || IncOrCmpUses != 2)
+ return false;
+ }
+
+ // Case 2
+ if (IVUses == 2 && IncOrCmpUses != 1)
+ return false;
+
+    // Each user of the IV must be a binary operation or a comparison
+ if (auto *BO = dyn_cast<BinaryOperator>(User)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Loop Increment
+ // User of Loop Increment should be either PHI or CMP
+ for (auto *UU : User->users()) {
+ if (PHINode *PN = dyn_cast<PHINode>(UU)) {
+ if (PN != IV)
+ return false;
+ }
+ // Must be a CMP or an ext (of a value with nsw) then CMP
+ else {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // Skip SExt if we are extending an nsw value
+ // TODO: Allow ZExt too
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
+ isa<SExtInst>(UUser))
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ if (!isCompareUsedByBranch(UUser))
+ return false;
+ }
+ }
+ } else
+ return false;
+    // Compare: can only have one use, and that use must be the branch
+ } else if (!IsCompInst)
+ return false;
+ }
+ return true;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+ SmallInstructionVector &PossibleIVs) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
+ continue;
+
+ if (const SCEVAddRecExpr *PHISCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
+ if (PHISCEV->getLoop() != L)
+ continue;
+ if (!PHISCEV->isAffine())
+ continue;
+ const SCEVConstant *IncSCEV = nullptr;
+ if (I->getType()->isPointerTy())
+ IncSCEV =
+ getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I);
+ else
+ IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ if (IncSCEV) {
+ const APInt &AInt = IncSCEV->getValue()->getValue().abs();
+ if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
+ continue;
+ IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
+ DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
+
+ if (isLoopControlIV(L, &*I)) {
+ assert(!LoopControlIV && "Found two loop control only IV");
+ LoopControlIV = &(*I);
+ DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = "
+ << *PHISCEV << "\n");
+ } else
+ PossibleIVs.push_back(&*I);
+ }
+ }
+ }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
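+//
+// For example (illustrative IR, not from the original source), the chain
+//   %r   = phi float [ 0.0, %preheader ], [ %r.3, %body ]
+//   %r.1 = fadd float %r, %a     ; single use
+//   %r.2 = fadd float %r.1, %b   ; single use
+//   %r.3 = fadd float %r.2, %c   ; used by the PHI and outside the loop
+// forms a valid SimpleLoopReduction rooted at %r.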
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+ assert(!Valid && "Cannot add to an already-valid chain");
+
+ // The reduction variable must be a chain of single-use instructions
+ // (including the PHI), except for the last value (which is used by the PHI
+ // and also outside the loop).
+ Instruction *C = Instructions.front();
+ if (C->user_empty())
+ return;
+
+ do {
+ C = cast<Instruction>(*C->user_begin());
+ if (C->hasOneUse()) {
+ if (!C->isBinaryOp())
+ return;
+
+ if (!(isa<PHINode>(Instructions.back()) ||
+ C->isSameOperationAs(Instructions.back())))
+ return;
+
+ Instructions.push_back(C);
+ }
+ } while (C->hasOneUse());
+
+ if (Instructions.size() < 2 ||
+ !C->isSameOperationAs(Instructions.back()) ||
+ C->use_empty())
+ return;
+
+ // C is now the (potential) last instruction in the reduction chain.
+ for (User *U : C->users()) {
+ // The only in-loop user can be the initial PHI.
+ if (L->contains(cast<Instruction>(U)))
+ if (cast<Instruction>(U) != Instructions.front())
+ return;
+ }
+
+ Instructions.push_back(C);
+ Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isSingleValueType())
+ continue;
+
+ SimpleLoopReduction SLR(&*I, L);
+ if (!SLR.valid())
+ continue;
+
+ DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+ SLR.size() << " chained instructions)\n");
+ Reductions.addSLR(SLR);
+ }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+//    use set (even if they are users). This is used, for example, to keep
+//    root increments out of the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+// if they are users, but their users are not added. This is used, for
+// example, to prevent a reduction update from forcing all later reduction
+// updates into the use set.
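+//
+// For example (illustrative): starting from a root like "%x = add i32 %iv, 1",
+// the walk adds the in-loop users of %x (say, a store and its address GEP, the
+// latter picked up as a single-use "feeder" operand), never enters
+// instructions in Exclude (such as the other root increments), and stops
+// descending once it reaches an instruction in Final (such as a reduction
+// update).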
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ Instruction *Root, const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ SmallInstructionVector Queue(1, Root);
+ while (!Queue.empty()) {
+ Instruction *I = Queue.pop_back_val();
+ if (!Users.insert(I).second)
+ continue;
+
+ if (!Final.count(I))
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // Ignore "wrap-around" uses to PHIs of this loop's header.
+ if (PN->getIncomingBlock(U) == L->getHeader())
+ continue;
+ }
+
+ if (L->contains(User) && !Exclude.count(User)) {
+ Queue.push_back(User);
+ }
+ }
+
+ // We also want to collect single-user "feeder" values.
+ for (User::op_iterator OI = I->op_begin(),
+ OIE = I->op_end(); OI != OIE; ++OI) {
+ if (Instruction *Op = dyn_cast<Instruction>(*OI))
+ if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+ !Final.count(Op))
+ Queue.push_back(Op);
+ }
+ }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ for (Instruction *Root : Roots)
+ collectInLoopUserSet(Root, Exclude, Final, Users);
+}
+
+static bool isUnorderedLoadStore(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return false;
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+ if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+
+ if ((BO && BO->getOpcode() != Instruction::Add) ||
+ (!BO && !isa<GetElementPtrInst>(U)))
+ return false;
+
+ for (auto *UU : U->users()) {
+ PHINode *PN = dyn_cast<PHINode>(UU);
+ if (PN && PN == IV)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+ SmallInstructionVector BaseUsers;
+
+ for (auto *I : Base->users()) {
+ ConstantInt *CI = nullptr;
+
+ if (isLoopIncrement(I, IV)) {
+ LoopIncs.push_back(cast<Instruction>(I));
+ continue;
+ }
+
+ // The root nodes must be either GEPs, ORs or ADDs.
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Or)
+ CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+ CI = dyn_cast<ConstantInt>(LastOperand);
+ }
+
+ if (!CI) {
+ if (Instruction *II = dyn_cast<Instruction>(I)) {
+ BaseUsers.push_back(II);
+ continue;
+ } else {
+ DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ return false;
+ }
+ }
+
+ int64_t V = std::abs(CI->getValue().getSExtValue());
+ if (Roots.find(V) != Roots.end())
+ // No duplicates, please.
+ return false;
+
+ Roots[V] = cast<Instruction>(I);
+ }
+
+ // Make sure we have at least two roots.
+ if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
+ return false;
+
+ // If we found non-loop-inc, non-root users of Base, assume they are
+ // for the zeroth root index. This is because "add %a, 0" gets optimized
+ // away.
+ if (BaseUsers.size()) {
+ if (Roots.find(0) != Roots.end()) {
+ DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ return false;
+ }
+ Roots[0] = Base;
+ }
+
+ // Calculate the number of users of the base, or lowest indexed, iteration.
+ unsigned NumBaseUses = BaseUsers.size();
+ if (NumBaseUses == 0)
+ NumBaseUses = Roots.begin()->second->getNumUses();
+
+ // Check that every node has the same number of users.
+ for (auto &KV : Roots) {
+ if (KV.first == 0)
+ continue;
+ if (!KV.second->hasNUses(NumBaseUses)) {
+ DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses << ", #Root=" <<
+ KV.second->getNumUses() << "\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+ // Does the user look like it could be part of a root set?
+ // All its users must be simple arithmetic ops.
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
+ return;
+
+ if (I != IV && findRootsBase(I, SubsumedInsts))
+ return;
+
+ SubsumedInsts.insert(I);
+
+ for (User *V : I->users()) {
+ Instruction *I = cast<Instruction>(V);
+ if (is_contained(LoopIncs, I))
+ continue;
+
+ if (!isSimpleArithmeticOp(I))
+ continue;
+
+ // The recursive call makes a copy of SubsumedInsts.
+ findRootsRecursive(I, SubsumedInsts);
+ }
+}
+
+bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
+ if (DRS.Roots.empty())
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+  // Now, for the loop iterations to be consecutive:
+ // D = d * N
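+  //
+  // For example (illustrative numbers): with BaseInst = {0,+,6} and roots at
+  // +2 and +4 relative to it, N = 3 and d = 2, so the requirement holds:
+  // the per-iteration step D = 6 equals d * N = 2 * 3.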
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ if (!ADR)
+ return false;
+ unsigned N = DRS.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
+ return false;
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+ // The base of a RootSet must be an AddRec, so it can be erased.
+ const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
+ if (!IVU_ADR || IVU_ADR->getLoop() != L)
+ return false;
+
+ std::map<int64_t, Instruction*> V;
+ if (!collectPossibleRoots(IVU, V))
+ return false;
+
+ // If we didn't get a root for index zero, then IVU must be
+ // subsumed.
+ if (V.find(0) == V.end())
+ SubsumedInsts.insert(IVU);
+
+ // Partition the vector into monotonically increasing indexes.
+ DAGRootSet DRS;
+ DRS.BaseInst = nullptr;
+
+ SmallVector<DAGRootSet, 16> PotentialRootSets;
+
+ for (auto &KV : V) {
+ if (!DRS.BaseInst) {
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ } else if (DRS.Roots.empty()) {
+ DRS.Roots.push_back(KV.second);
+ } else if (V.find(KV.first - 1) != V.end()) {
+ DRS.Roots.push_back(KV.second);
+ } else {
+ // Linear sequence terminated.
+ if (!validateRootSet(DRS))
+ return false;
+
+ // Construct a new DAGRootSet with the next sequence.
+ PotentialRootSets.push_back(DRS);
+ DRS.BaseInst = KV.second;
+ DRS.Roots.clear();
+ }
+ }
+
+ if (!validateRootSet(DRS))
+ return false;
+
+ PotentialRootSets.push_back(DRS);
+
+ RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::findRoots() {
+ Inc = IVToIncMap[IV];
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (std::abs(Inc) == 1) {
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
+ }
+ findRootsRecursive(IV, SmallInstructionSet());
+ LoopIncs.push_back(IV);
+ } else {
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
+ }
+
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ return false;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ DEBUG(dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
+ }
+ }
+
+ Scale = RootSets[0].Roots.size() + 1;
+
+ if (Scale > IL_MaxRerollIterations) {
+ DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+ << "\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+ // Populate the MapVector with all instructions in the block, in order first,
+ // so we can iterate over the contents later in perfect order.
+ for (auto &I : *L->getHeader()) {
+ Uses[&I].resize(IL_End);
+ }
+
+ SmallInstructionSet Exclude;
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+ for (auto &DRS : RootSets) {
+ DenseSet<Instruction*> VBase;
+ collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+ for (auto *I : VBase) {
+ Uses[I].set(0);
+ }
+
+ unsigned Idx = 1;
+ for (auto *Root : DRS.Roots) {
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+ // While we're here, check the use sets are the same size.
+ if (V.size() != VBase.size()) {
+ DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ return false;
+ }
+
+ for (auto *I : V) {
+ Uses[I].set(Idx);
+ }
+ ++Idx;
+ }
+
+ // Make sure our subsumed instructions are remembered too.
+ for (auto *I : DRS.SubsumedInsts) {
+ Uses[I].set(IL_All);
+ }
+ }
+
+ // Make sure the loop increments are also accounted for.
+
+ Exclude.clear();
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+ for (auto *I : V) {
+ Uses[I].set(IL_All);
+ }
+
+  return true;
+}
+
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI) {
+ UsesTy::iterator I = StartI ? *StartI : In.begin();
+ while (I != In.end() && (I->second.test(Val) == 0 ||
+ Exclude.count(I->first) != 0))
+ ++I;
+ return I;
+}
+
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (DRS.BaseInst == I)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (is_contained(DRS.Roots, I))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if any instruction between Start and End uses (i.e. depends
+/// on) instruction I.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End) {
+ for (auto *U : I->users()) {
+ for (auto It = Start; It != End; ++It)
+ if (U == It->first)
+ return true;
+ }
+ return false;
+}
+
+static bool isIgnorableInst(const Instruction *I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ return true;
+ const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case llvm::Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::var_annotation:
+ // TODO: the following intrinsics may also be whitelisted:
+ // lifetime_start, lifetime_end, invariant_start, invariant_end
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
+
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
+
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
+
+ // Make sure we mark loop-control-only PHIs as used in all iterations. See
+ // comment above LoopReroll::isLoopControlIV for more information.
+ BasicBlock *Header = L->getHeader();
+ if (LoopControlIV && LoopControlIV != IV) {
+ for (auto *U : LoopControlIV->users()) {
+ Instruction *IVUser = dyn_cast<Instruction>(U);
+ // IVUser could be loop increment or compare
+ Uses[IVUser].set(IL_All);
+ for (auto *UU : IVUser->users()) {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // UUser could be compare, PHI or branch
+ Uses[UUser].set(IL_All);
+ // Skip SExt
+ if (isa<SExtInst>(UUser)) {
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ Uses[UUser].set(IL_All);
+ }
+ // Is UUser a compare instruction?
+ if (UU->hasOneUse()) {
+ Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
+ if (BI == cast<BranchInst>(Header->getTerminator()))
+ Uses[BI].set(IL_All);
+ }
+ }
+ }
+ }
+
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+ DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
+
+ DEBUG(
+ for (auto &KV : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ }
+ );
+
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+        // Last chance saloon. We don't try to solve the full isomorphism
+        // problem, but try to at least catch the case where two instructions
+        // *of different types* are round the wrong way. We won't be able to
+        // efficiently tell, given two ADD instructions, which way around we
+        // should match them, but given an ADD and a SUB, we can at least infer
+        // which one is which.
+        //
+        // This should allow us to deal with a greater subset of the
+        // isomorphism problem. It does, however, change a linear algorithm
+        // into a quadratic one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+ }
+
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
+ }
+
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+        // Note: This is specifically guarded by a check on isa<PHINode>,
+        // which, while a valid (somewhat arbitrary) micro-optimization, is
+        // needed because otherwise isSafeToSpeculativelyExecute returns
+        // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+      // If we've passed an instruction from a future iteration that may have
+      // side effects, and this instruction might have them as well, then we
+      // can't reorder them, and this matching fails. As an exception, we allow
+      // the alias set tracker to handle regular (unordered) load/store
+      // dependencies.
+ if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isUnorderedLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst <<
+ " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+          // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
+ }
+
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ }
+ assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
+ }
+
+ DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+ *IV << "\n");
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
+ J != JE;) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
+ DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ J++->eraseFromParent();
+ continue;
+ }
+
+ ++J;
+ }
+
+ bool HasTwoIVs = LoopControlIV && LoopControlIV != IV;
+
+ if (HasTwoIVs) {
+ updateNonLoopCtrlIncr();
+ replaceIV(LoopControlIV, LoopControlIV, IterCount);
+ } else
+ // We need to create a new induction variable for each different BaseInst.
+ for (auto &DRS : RootSets)
+ // Insert the new induction variable.
+ replaceIV(DRS.BaseInst, IV, IterCount);
+
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
+
+// For non-loop-control IVs, we only need to update the last increment
+// with the right amount, and then we are done.
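+//
+// For example (illustrative): with Scale == 4, a remaining increment of the
+// form "add %x, 4" (or a GEP with a constant offset of 4) is rewritten to
+// step by 4 / Scale = 1 per rerolled iteration.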
+void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() {
+ const SCEV *NewInc = nullptr;
+ for (auto *LoopInc : LoopIncs) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc);
+ const SCEVConstant *COp = nullptr;
+ if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ } else {
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0)));
+ if (!COp)
+ COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1)));
+ }
+
+ assert(COp && "Didn't find constant operand of LoopInc!\n");
+
+ const APInt &AInt = COp->getValue()->getValue();
+ const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale);
+ if (AInt.isNegative()) {
+ NewInc = SE->getNegativeSCEV(COp);
+ NewInc = SE->getUDivExpr(NewInc, ScaleSCEV);
+ NewInc = SE->getNegativeSCEV(NewInc);
+ } else
+ NewInc = SE->getUDivExpr(COp, ScaleSCEV);
+
+ LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue());
+ }
+}
+
+void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst,
+ Instruction *InstIV,
+ const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ int64_t Inc = IVToIncMap[InstIV];
+ bool NeedNewIV = InstIV == LoopControlIV;
+ bool Negative = !NeedNewIV && Inc < 0;
+
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst));
+ const SCEV *Start = RealIVSCEV->getStart();
+
+ if (NeedNewIV)
+ Start = SE->getConstant(Start->getType(), 0);
+
+ const SCEV *SizeOfExpr = nullptr;
+ const SCEV *IncrExpr =
+ SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1);
+ if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) {
+ Type *ElTy = PTy->getElementType();
+ SizeOfExpr =
+ SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy);
+ IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr);
+ }
+ const SCEV *NewIVSCEV =
+ SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
+
+ { // Limit the lifetime of SCEVExpander.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
+ Header->getFirstNonPHIOrDbg());
+
+ for (auto &KV : Uses)
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(Inst, NewIV);
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ // FIXME: Why do we need this check?
+ if (Uses[BI].find_first() == IL_All) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+ if (NeedNewIV)
+ ICSCEV = SE->getMulExpr(IterCount,
+ SE->getConstant(IterCount->getType(), Scale));
+
+ // Iteration count SCEV minus or plus 1
+ const SCEV *MinusPlus1SCEV =
+ SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1);
+ if (Inst->getType()->isPointerTy()) {
+ assert(SizeOfExpr && "SizeOfExpr is not initialized");
+ MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr);
+ }
+
+ const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV);
+ // Iteration count minus 1
+ Instruction *InsertPtr = nullptr;
+ if (isa<SCEVConstant>(ICMinusPlus1SCEV)) {
+ InsertPtr = BI;
+ } else {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ InsertPtr = Preheader->getTerminator();
+ }
+
+ if (!isa<PointerType>(NewIV->getType()) && NeedNewIV &&
+ (SE->getTypeSizeInBits(NewIV->getType()) <
+ SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) {
+ IRBuilder<> Builder(BI);
+ Builder.SetCurrentDebugLocation(BI->getDebugLoc());
+ NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType());
+ }
+ Value *ICMinusPlus1 = Expander.expandCodeFor(
+ ICMinusPlus1SCEV, NewIV->getType(), InsertPtr);
+
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+ }
+ }
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (int i : Reds) {
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+ J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+ " reduction use count " << Count <<
+ " is not equal to the base use count " <<
+ BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fixup reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (int i : Reds) {
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (User *U : PossibleReds[i].getReducedValue()->users()) {
+ Users.push_back(cast<Instruction>(U));
+ }
+
+ for (Instruction *User : Users)
+ User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+//   speculatively executed) and so do the matched instructions, then we
+//   cannot reorder those side-effect-producing instructions, and rerolling
+//   fails.
+//
+// Finally, we make sure that every loop instruction is either a loop
+// increment root, part of the simple latch code, part of a validated
+// reduction, part of f(%iv), or part of some f(%iv.i). If all of that is true
+// (and all reductions have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *IterCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+ IVToIncMap, LoopControlIV);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+ *IV << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(IterCount);
+
+ ++NumRerolledLoops;
+ return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ BasicBlock *Header = L->getHeader();
+ DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+ "] Loop %" << Header->getName() << " (" <<
+ L->getNumBlocks() << " block(s))\n");
+
+ // For now, we'll handle only single BB loops.
+ if (L->getNumBlocks() > 1)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+ const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
+ DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+ // First, we need to find the induction variable with respect to which we can
+ // reroll (there may be several possible options).
+ SmallInstructionVector PossibleIVs;
+ IVToIncMap.clear();
+ LoopControlIV = nullptr;
+ collectPossibleIVs(L, PossibleIVs);
+
+ if (PossibleIVs.empty()) {
+ DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ return false;
+ }
+
+ ReductionTracker Reductions;
+ collectPossibleReductions(L, Reductions);
+ bool Changed = false;
+
+ // For each possible IV, collect the associated possible set of 'root' nodes
+ // (i+1, i+2, etc.).
+ for (Instruction *PossibleIV : PossibleIVs)
+ if (reroll(PossibleIV, L, Header, IterCount, Reductions)) {
+ Changed = true;
+ break;
+ }
+ DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+
+ // Trip count of L has changed so SE must be re-evaluated.
+ if (Changed)
+ SE->forgetLoop(L);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 000000000000..3506ac343d59
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,748 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+static cl::opt<unsigned> DefaultRotationThreshold(
+ "rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
+
+STATISTIC(NumRotated, "Number of loops rotated");
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ) {}
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI
+/// nodes to merge the two values. Do this now.
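+///
+/// For illustration (hypothetical names): if %v is defined in the old header
+/// and used in an exit block, the value now has two definitions, %v in the
+/// header and its mapped clone in the preheader, so the exit-block use is
+/// rewritten through a PHI node built by SSAUpdater.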
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+ // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ LLVMContext &C = OrigHeader->getContext();
+ if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
+ if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
+ for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
+ // Grab the use before incrementing the iterator. Otherwise, altering
+ // the Use will invalidate the iterator.
+ Use &U = *UI++;
+ DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
+ if (!UserInst)
+ continue;
+
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = UserInst->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB; if not, substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
+ }
+ }
+ }
+ }
+}
+
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
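+///
+/// For illustration only (hypothetical blocks, schematic code), a while-style
+/// loop
+///
+/// preheader: br %header
+/// header: %c = cond(); br %c, %body, %exit
+/// body: ...; br %header
+///
+/// is rotated into a do-while-style loop: the header's test is cloned into the
+/// preheader, and the loop test now guards the backedge at the bottom:
+///
+/// preheader: %c0 = cond(); br %c0, %body, %exit
+/// body: ...; %c = cond(); br %c, %body, %exit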
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return false;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return false;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+ return false;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.convergent) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return false;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize)
+ return false;
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader)
+ return false;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated.
+ if (SE)
+ SE->forgetLoop(L);
+
+ DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+ // Find the new loop header. NewHeader is the header's one and only successor
+ // that is inside the loop; the header's other successor is outside the
+ // loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ ValueMap[Inst] = V;
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ ValueMap[Inst] = C;
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ TerminatorInst *TI = OrigHeader->getTerminator();
+ for (BasicBlock *SuccBB : TI->successors())
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Update DominatorTree to reflect the CFG change we just made. Then split
+ // edges as necessary to preserve LoopSimplify form.
+ if (DT) {
+ // Everything that was dominated by the old loop header is now dominated
+ // by the original loop preheader. Conceptually the header was merged
+ // into the preheader, even though we reuse the actual block as a new
+ // loop latch.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+
+ // Update OrigHeader to be dominated by the new header block.
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
+ }
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit))
+ continue;
+ if (isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) {
+ // Update OrigHeader to be dominated by the new header block.
+ DT->changeImmediateDominator(NewHeader, OrigPreheader);
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+ // Brute force incremental dominator tree update. Call
+ // findNearestCommonDominator on all CFG predecessors of each child of the
+ // original header.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ bool Changed;
+ do {
+ Changed = false;
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+ DomTreeNode *Node = HeaderChildren[I];
+ BasicBlock *BB = Node->getBlock();
+
+ BasicBlock *NearestDom = nullptr;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ // Consider only reachable basic blocks.
+ if (!DT->getNode(Pred))
+ continue;
+
+ if (!NearestDom) {
+ NearestDom = Pred;
+ continue;
+ }
+
+ NearestDom = DT->findNearestCommonDominator(NearestDom, Pred);
+ assert(NearestDom && "No NearestCommonDominator found");
+ }
+
+ assert(NearestDom && "Nearest dominator not found");
+
+ // Remember if this changes the DomTree.
+ if (Node->getIDom()->getBlock() != NearestDom) {
+ DT->changeImmediateDominator(BB, NearestDom);
+ Changed = true;
+ }
+ }
+
+ // If the dominator changed, this may have an effect on other
+ // predecessors, continue until we reach a fixpoint.
+ } while (Changed);
+ }
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+ DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+ return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
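+///
+/// A minimal sketch for illustration (hypothetical blocks and values): a latch
+/// containing only the post-increment
+///
+/// exiting: %c = icmp ...; br %c, %latch, %exit
+/// latch: %i.next = add %i, 1; br %header
+///
+/// becomes, after speculating the add into the exiting block and deleting the
+/// now-empty latch,
+///
+/// exiting: %c = icmp ...; %i.next = add %i, 1; br %c, %header, %exit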
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ // Hoist the instructions from Latch into LastExit.
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+ Latch->begin(), Jmp->getIterator());
+
+ unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+ BasicBlock *Header = Jmp->getSuccessor(0);
+ assert(Header == L->getHeader() && "expected a backward branch");
+
+ // Remove Latch from the CFG so that LastExit becomes the new Latch.
+ BI->setSuccessor(FallThruPath, Header);
+ Latch->replaceSuccessorsPhiUsesWith(LastExit);
+ Jmp->eraseFromParent();
+
+ // Nuke the Latch block.
+ assert(Latch->empty() && "unable to evacuate Latch");
+ LI->removeBlock(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
+ Latch->eraseFromParent();
+ return true;
+}
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ bool SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange;
+}
+
+LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
+ : EnableHeaderDuplication(EnableHeaderDuplication) {}
+
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+ LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+ SQ);
+
+ bool Changed = LR.processLoop(&L);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+
+class LoopRotateLegacyPass : public LoopPass {
+ unsigned MaxHeaderSize;
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+ initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
+ if (SpecifiedMaxHeaderSize == -1)
+ MaxHeaderSize = DefaultRotationThreshold;
+ else
+ MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+ }
+
+ // LCSSA form makes instruction renaming easier.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ Function &F = *L->getHeader()->getParent();
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
+ return LR.processLoop(L);
+ }
+};
+} // end anonymous namespace
+
+char LoopRotateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
+ false)
+
+Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
+ return new LoopRotateLegacyPass(MaxHeaderSize);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
new file mode 100644
index 000000000000..35c05e84fd68
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -0,0 +1,109 @@
+//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
+// basic loop CFG cleanup, primarily to assist other loop passes. If you
+// encounter a noncanonical CFG construct that causes another loop pass to
+// perform suboptimally, this is the place to fix it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplifycfg"
+
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
+ bool Changed = false;
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
+
+ for (auto &Block : Blocks) {
+ // Attempt to merge blocks in the trivial case. Don't modify blocks which
+ // belong to other loops.
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
+ if (!Succ)
+ continue;
+
+ BasicBlock *Pred = Succ->getSinglePredecessor();
+ if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
+ continue;
+
+ // Pred is going to disappear, so we need to update the loop info.
+ if (L.getHeader() == Pred)
+ L.moveToHeader(Succ);
+ LI.removeBlock(Pred);
+ MergeBasicBlockIntoOnlyPred(Succ, &DT);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopSimplifyCFGLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
+ initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &) override {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ return simplifyLoopCFG(*L, DT, LI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+char LoopSimplifyCFGLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+
+Pass *llvm::createLoopSimplifyCFGPass() {
+ return new LoopSimplifyCFGLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
new file mode 100644
index 000000000000..c9d55b4594fe
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -0,0 +1,373 @@
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does the inverse transformation of what LICM does: it traverses
+// all of the instructions in the loop's preheader and sinks them into the
+// loop body wherever the block frequency is lower than that of the loop's
+// preheader. It differs from the Sink pass in the following ways:
+//
+// * It only handles sinking of instructions from the loop's preheader to the
+// loop's body
+// * It uses alias set tracker to get more accurate alias info
+// * It uses block frequency info to find the optimal sinking locations
+//
+// Overall algorithm:
+//
+// For I in Preheader:
+// InsertBBs = BBs that use I
+// For BB in sorted(LoopBBs):
+// DomBBs = BBs in InsertBBs that are dominated by BB
+// if freq(DomBBs) > freq(BB)
+// InsertBBs = InsertBBs - DomBBs + BB
+// For BB in InsertBBs:
+// Insert I at BB's beginning
+//
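+// For illustration (hypothetical frequencies): with Freq(Preheader) = 100 and
+// an instruction whose only use sits in a loop block of frequency 30, the
+// instruction is sunk into that block; if its uses were spread over blocks
+// whose adjusted total frequency exceeded 100, it would stay in the preheader.
+//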
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loopsink"
+
+STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
+STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
+
+static cl::opt<unsigned> SinkFrequencyPercentThreshold(
+ "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
+ cl::desc("Do not sink instructions that require cloning unless they "
+ "execute less than this percent of the time."));
+
+static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
+ "max-uses-for-sinking", cl::Hidden, cl::init(30),
+ cl::desc("Do not sink instructions that have too many uses."));
+
+/// Return adjusted total frequency of \p BBs.
+///
+/// * If there is only one BB, sinking the instruction will not introduce a
+/// code size increase. Thus there is no need to adjust the frequency.
+/// * If there is more than one BB, sinking would lead to code size increase.
+/// In this case, we add some "tax" to the total frequency to make it harder
+/// to sink. E.g.
+/// Freq(Preheader) = 100
+/// Freq(BBs) = sum(50, 49) = 99
+/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
+/// BBs as the difference is too small to justify the code size increase.
+/// To model this, the adjusted Freq(BBs) will be:
+/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
+/// = 99 / 90% = 110 > 100, so sinking is rejected here.
+static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
+ BlockFrequencyInfo &BFI) {
+ BlockFrequency T = 0;
+ for (BasicBlock *B : BBs)
+ T += BFI.getBlockFreq(B);
+ if (BBs.size() > 1)
+ T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
+ return T;
+}
+
+/// Return the set of basic blocks in which to insert the sunk instructions.
+///
+/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
+///
+/// * Inside the loop \p L
+/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
+/// that dominates the UseBB
+/// * Has minimum total frequency that is no greater than preheader frequency
+///
+/// The purpose of the function is to find the optimal sinking points to
+/// minimize execution cost, which is defined as "sum of frequency of
+/// BBsToSinkInto".
+/// As a result, the returned BBsToSinkInto needs to have minimum total
+/// frequency.
+/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
+/// frequency, the optimal solution is not sinking (return empty set).
+///
+/// \p ColdLoopBBs is used to help find the optimal sinking locations.
+/// It stores a list of BBs that is:
+///
+/// * Inside the loop \p L
+/// * Has a frequency no larger than the loop's preheader
+/// * Sorted by BB frequency
+///
+/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
+/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
+/// caller.
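+///
+/// A worked example for illustration (hypothetical frequencies): suppose
+/// Freq(Preheader) = 100 and the uses live in BB1 (freq 60) and BB2 (freq 30),
+/// both dominated by a colder block BB0 (freq 40). The adjusted frequency of
+/// {BB1, BB2} is (60 + 30) / 90% = 100 > 40, so {BB1, BB2} is replaced by
+/// {BB0}. Since 40 does not exceed 100, the instruction is sunk once into BB0
+/// instead of being cloned into both BB1 and BB2.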
+static SmallPtrSet<BasicBlock *, 2>
+findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ DominatorTree &DT, BlockFrequencyInfo &BFI) {
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
+ if (UseBBs.size() == 0)
+ return BBsToSinkInto;
+
+ BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
+ SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
+
+ // For every iteration:
+ // * Pick the ColdestBB from ColdLoopBBs
+ // * Find the set BBsDominatedByColdestBB that satisfy:
+ // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
+ // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
+ // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
+ // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
+ // BBsToSinkInto
+ for (BasicBlock *ColdestBB : ColdLoopBBs) {
+ BBsDominatedByColdestBB.clear();
+ for (BasicBlock *SinkedBB : BBsToSinkInto)
+ if (DT.dominates(ColdestBB, SinkedBB))
+ BBsDominatedByColdestBB.insert(SinkedBB);
+ if (BBsDominatedByColdestBB.size() == 0)
+ continue;
+ if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
+ BFI.getBlockFreq(ColdestBB)) {
+ for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
+ BBsToSinkInto.erase(DominatedBB);
+ }
+ BBsToSinkInto.insert(ColdestBB);
+ }
+ }
+
+ // If the total frequency of BBsToSinkInto is larger than preheader frequency,
+ // do not sink.
+ if (adjustedSumFreq(BBsToSinkInto, BFI) >
+ BFI.getBlockFreq(L.getLoopPreheader()))
+ BBsToSinkInto.clear();
+ return BBsToSinkInto;
+}
+
+// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
+// sinking is successful.
+// \p LoopBlockNumber is used to sort the insertion blocks to ensure
+// determinism.
+static bool sinkInstruction(Loop &L, Instruction &I,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
+ LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo &BFI) {
+ // Compute the set of blocks in loop L which contain a use of I.
+ SmallPtrSet<BasicBlock *, 2> BBs;
+ for (auto &U : I.uses()) {
+ Instruction *UI = cast<Instruction>(U.getUser());
+ // We cannot sink I to PHI-uses.
+ if (dyn_cast<PHINode>(UI))
+ return false;
+ // We cannot sink I if it has uses outside of the loop.
+ if (!L.contains(LI.getLoopFor(UI->getParent())))
+ return false;
+ BBs.insert(UI->getParent());
+ }
+
+ // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
+ // BBs.size() to avoid expensive computation.
+ // FIXME: Handle code size growth for min_size and opt_size.
+ if (BBs.size() > MaxNumberOfUseBBsForSinking)
+ return false;
+
+ // Find the set of BBs that we should insert a copy of I.
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
+ findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
+ if (BBsToSinkInto.empty())
+ return false;
+
+ // Copy the final BBs into a vector and sort them using the total ordering
+ // of the loop block numbers as iterating the set doesn't give a useful
+ // order. No need to stable sort as the block numbers are a total ordering.
+ SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
+ SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
+ BBsToSinkInto.end());
+ std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
+ });
+
+ BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
+ // FIXME: Optimize the efficiency for cloned value replacement. The current
+ // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
+ for (BasicBlock *N : SortedBBsToSinkInto) {
+ if (N == MoveBB)
+ continue;
+ // Clone I and replace its uses.
+ Instruction *IC = I.clone();
+ IC->setName(I.getName());
+ IC->insertBefore(&*N->getFirstInsertionPt());
+ // Replaces uses of I with IC in N
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == N)
+ U.set(IC);
+ }
+ // Replaces uses of I with IC in blocks dominated by N
+ replaceDominatedUsesWith(&I, IC, DT, N);
+ DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
+ NumLoopSunkCloned++;
+ }
+ DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ NumLoopSunk++;
+ I.moveBefore(&*MoveBB->getFirstInsertionPt());
+
+ return true;
+}
+
+/// Sinks instructions from the loop's preheader into the loop body if the
+/// summed frequency of the inserted copies is smaller than the preheader's
+/// frequency.
+static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
+ DominatorTree &DT,
+ BlockFrequencyInfo &BFI,
+ ScalarEvolution *SE) {
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Enable LoopSink only when runtime profile is available.
+ // With static profile, the sinking decision may be sub-optimal.
+ if (!Preheader->getParent()->getEntryCount())
+ return false;
+
+ const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
+ // If there are no basic blocks with lower frequency than the preheader then
+ // we can avoid the detailed analysis as we will never find profitable sinking
+ // opportunities.
+ if (all_of(L.blocks(), [&](const BasicBlock *BB) {
+ return BFI.getBlockFreq(BB) > PreheaderFreq;
+ }))
+ return false;
+
+ bool Changed = false;
+ AliasSetTracker CurAST(AA);
+
+ // Compute alias set.
+ for (BasicBlock *BB : L.blocks())
+ CurAST.add(*BB);
+
+ // Sort loop's basic blocks by frequency
+ SmallVector<BasicBlock *, 10> ColdLoopBBs;
+ SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
+ int i = 0;
+ for (BasicBlock *B : L.blocks())
+ if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
+ ColdLoopBBs.push_back(B);
+ LoopBlockNumber[B] = ++i;
+ }
+ std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
+ [&](BasicBlock *A, BasicBlock *B) {
+ return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
+ });
+
+ // Traverse the preheader's instructions in reverse order because if A
+ // depends on B (A appears after B), A needs to be sunk first before B can
+ // be sunk.
+ for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
+ Instruction *I = &*II++;
+ // No need to check whether the instruction's operands are loop invariant:
+ assert(L.hasLoopInvariantOperands(I) &&
+ "Insts in a loop's preheader should have loop invariant operands!");
+ if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
+ continue;
+ if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
+ Changed = true;
+ }
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(&L);
+ return Changed;
+}
+
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct LegacyLoopSinkPass : public LoopPass {
+ static char ID;
+ LegacyLoopSinkPass() : LoopPass(ID) {
+ initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ return sinkLoopInvariantInstructions(
+ *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
+ SE ? &SE->getSE() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+char LegacyLoopSinkPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
+
+Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 000000000000..3638da118cb7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,5462 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs strength reduction on array references inside loops that
+// have the loop induction variable as one or more of their components. It
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+// %i = phi [ 0, %entry ], [ %i.next, %latch ]
+// ...
+// %i.next = add %i, 1
+// %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+// smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+// multiple base registers, or as the increment expression in an addrec),
+// we may not actually need both reg and (-1 * reg) in registers; the
+// negation can be implemented by using a sub instead of an add. The
+// lack of support for taking this into consideration when making
+// register pressure decisions is partly worked around by the "Special"
+// use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to clean up congruent phis after LSR phi expansion. It was
+// originally meant to be removed after the v3.0 release, but it is now
+// enabled by default because IV chains rely on it.
+static cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::init(true),
+ cl::desc("Enable LSR phi elimination"));
+
+// The flag adds the instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow the complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+ "stress-ivchain", cl::Hidden, cl::init(false),
+ cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+struct MemAccessTy {
+ /// Used in situations where the accessed memory type is unknown.
+ static const unsigned UnknownAddressSpace = ~0u;
+
+ Type *MemTy;
+ unsigned AddrSpace;
+
+ MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
+
+ MemAccessTy(Type *Ty, unsigned AS) :
+ MemTy(Ty), AddrSpace(AS) {}
+
+ bool operator==(MemAccessTy Other) const {
+ return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+ }
+
+ bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+ static MemAccessTy getUnknown(LLVMContext &Ctx,
+ unsigned AS = UnknownAddressSpace) {
+ return MemAccessTy(Type::getVoidTy(Ctx), AS);
+ }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+ /// This represents the set of LSRUse indices which reference
+ /// a particular register.
+ SmallBitVector UsedByIndices;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+void RegSortData::print(raw_ostream &OS) const {
+ OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// Map register candidates to information about how they are used.
+class RegUseTracker {
+ typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
+
+ RegUsesTy RegUsesMap;
+ SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+ void countRegister(const SCEV *Reg, size_t LUIdx);
+ void dropRegister(const SCEV *Reg, size_t LUIdx);
+ void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+ bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+ const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+ void clear();
+
+ typedef SmallVectorImpl<const SCEV *>::iterator iterator;
+ typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
+ iterator begin() { return RegSequence.begin(); }
+ iterator end() { return RegSequence.end(); }
+ const_iterator begin() const { return RegSequence.begin(); }
+ const_iterator end() const { return RegSequence.end(); }
+};
+
+} // end anonymous namespace
+
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+ std::pair<RegUsesTy::iterator, bool> Pair =
+ RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+ RegSortData &RSD = Pair.first->second;
+ if (Pair.second)
+ RegSequence.push_back(Reg);
+ RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+ RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+ RegUsesTy::iterator It = RegUsesMap.find(Reg);
+ assert(It != RegUsesMap.end());
+ RegSortData &RSD = It->second;
+ assert(RSD.UsedByIndices.size() > LUIdx);
+ RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+ assert(LUIdx <= LastLUIdx);
+
+ // Update RegUses. The data structure is not optimized for this purpose;
+ // we must iterate through it and update each of the bit vectors.
+ for (auto &Pair : RegUsesMap) {
+ SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
+ if (LUIdx < UsedByIndices.size())
+ UsedByIndices[LUIdx] =
+ LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
+ UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+ }
+}
+
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ if (I == RegUsesMap.end())
+ return false;
+ const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+ int i = UsedByIndices.find_first();
+ if (i == -1) return false;
+ if ((size_t)i != LUIdx) return true;
+ return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ assert(I != RegUsesMap.end() && "Unknown register!");
+ return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+ RegUsesMap.clear();
+ RegSequence.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing a
+/// value satisfying a use. It may include broken-out immediates and scaled
+/// registers.
+struct Formula {
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg;
+
+ /// The scale of any complex addressing.
+ int64_t Scale;
+
+ /// The list of "base" registers for this use. When this is non-empty, the
+ /// canonical representation of a formula is
+ /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+ /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// 3. The reg containing the recurrent expr related to the current loop in
+ /// the formula should be put in the ScaledReg.
+ /// #1 enforces that the scaled register is always used when at least two
+ /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+ /// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to the current loop can be combined
+ /// together in LSR codegen.
+ /// This invariant can be temporarily broken while building a formula.
+ /// However, every formula inserted into the LSRInstance must be in canonical
+ /// form.
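+ /// For example, with {0,+,4}<L> as the addrec of the current loop L, the
+ /// sum (%a + %b + {0,+,4}<L>) is canonically kept as BaseRegs = {%a, %b},
+ /// ScaledReg = {0,+,4}<L>, Scale = 1.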
+ SmallVector<const SCEV *, 4> BaseRegs;
+
+ /// The 'scaled' register for this use. This should be non-null when Scale is
+ /// not zero.
+ const SCEV *ScaledReg;
+
+ /// An additional constant offset which is added near the use. This requires a
+ /// temporary register, but the offset itself can live in an add immediate
+ /// field rather than a register.
+ int64_t UnfoldedOffset;
+
+ Formula()
+ : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
+ ScaledReg(nullptr), UnfoldedOffset(0) {}
+
+ void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+ bool isCanonical(const Loop &L) const;
+
+ void canonicalize(const Loop &L);
+
+ bool unscale();
+
+ bool hasZeroEnd() const;
+
+ size_t getNumRegs() const;
+ Type *getType() const;
+
+ void deleteBaseReg(const SCEV *&S);
+
+ bool referencesReg(const SCEV *S) const;
+ bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Recursion helper for initialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+ SmallVectorImpl<const SCEV *> &Good,
+ SmallVectorImpl<const SCEV *> &Bad,
+ ScalarEvolution &SE) {
+ // Collect expressions which properly dominate the loop header.
+ if (SE.properlyDominates(S, L->getHeader())) {
+ Good.push_back(S);
+ return;
+ }
+
+ // Look at add operands.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands())
+ DoInitialMatch(S, L, Good, Bad, SE);
+ return;
+ }
+
+ // Look at addrec operands.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ if (!AR->getStart()->isZero() && AR->isAffine()) {
+ DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+ DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+ AR->getStepRecurrence(SE),
+ // FIXME: AR->getNoWrapFlags()
+ AR->getLoop(), SCEV::FlagAnyWrap),
+ L, Good, Bad, SE);
+ return;
+ }
+
+ // Handle a multiplication by -1 (negation) if it didn't fold.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+ if (Mul->getOperand(0)->isAllOnesValue()) {
+ SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+ const SCEV *NewMul = SE.getMulExpr(Ops);
+
+ SmallVector<const SCEV *, 4> MyGood;
+ SmallVector<const SCEV *, 4> MyBad;
+ DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+ const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+ SE.getEffectiveSCEVType(NewMul->getType())));
+ for (const SCEV *S : MyGood)
+ Good.push_back(SE.getMulExpr(NegOne, S));
+ for (const SCEV *S : MyBad)
+ Bad.push_back(SE.getMulExpr(NegOne, S));
+ return;
+ }
+
+ // Ok, we can't do anything interesting. Just stuff the whole thing into a
+ // register and hope for the best.
+ Bad.push_back(S);
+}
+
+/// Incorporate loop-variant parts of S into this Formula, attempting to keep
+/// all loop-invariant and loop-computable values in a single base register.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+ SmallVector<const SCEV *, 4> Good;
+ SmallVector<const SCEV *, 4> Bad;
+ DoInitialMatch(S, L, Good, Bad, SE);
+ if (!Good.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Good);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ if (!Bad.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Bad);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ canonicalize(*L);
+}
+
+/// \brief Check whether or not this formula satisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is but its loop is not the
+ // current loop, while BaseRegs contains a recurrent expr reg related to the
+ // current loop, we want to swap the reg in BaseRegs with ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
+}
+
+/// \brief Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
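+/// For example, BaseRegs = {reg1, reg2} with no ScaledReg becomes
+/// BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1; if reg1 (and not reg2) is
+/// the addrec of the current loop, the two registers are then swapped so that
+/// the addrec ends up in ScaledReg.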
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
+ return;
+ // So far we did not need this case. This is easy to implement but it is
+ // useless to maintain dead code. Besides, it could hurt compile time.
+ assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
+ // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related to Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
+}
+
+/// \brief Get rid of the scale in the formula.
+/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+ if (Scale != 1)
+ return false;
+ Scale = 0;
+ BaseRegs.push_back(ScaledReg);
+ ScaledReg = nullptr;
+ return true;
+}
+
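+/// Return true if this formula is a plain single base register with no
+/// constant offsets and no scaled register.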
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
+/// Return the total number of register operands used by this formula. This does
+/// not include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+ return !!ScaledReg + BaseRegs.size();
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. This type
+/// is meaningless except for the bit size.
+Type *Formula::getType() const {
+ return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+ ScaledReg ? ScaledReg->getType() :
+ BaseGV ? BaseGV->getType() :
+ nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list.
+void Formula::deleteBaseReg(const SCEV *&S) {
+ if (&S != &BaseRegs.back())
+ std::swap(S, BaseRegs.back());
+ BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+ return S == ScaledReg || is_contained(BaseRegs, S);
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const {
+ if (ScaledReg)
+ if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+ return true;
+ for (const SCEV *BaseReg : BaseRegs)
+ if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
+ return true;
+ return false;
+}
+
+void Formula::print(raw_ostream &OS) const {
+ bool First = true;
+ if (BaseGV) {
+ if (!First) OS << " + "; else First = false;
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ }
+ if (BaseOffset != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << BaseOffset;
+ }
+ for (const SCEV *BaseReg : BaseRegs) {
+ if (!First) OS << " + "; else First = false;
+ OS << "reg(" << *BaseReg << ')';
+ }
+ if (HasBaseReg && BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: HasBaseReg**";
+ } else if (!HasBaseReg && !BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: !HasBaseReg**";
+ }
+ if (Scale != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << Scale << "*reg(";
+ if (ScaledReg)
+ OS << *ScaledReg;
+ else
+ OS << "<unknown>";
+ OS << ')';
+ }
+ if (UnfoldedOffset != 0) {
+ if (!First) OS << " + ";
+ OS << "imm(" << UnfoldedOffset << ')';
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+ return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+ return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(),
+ SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+ return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
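+/// For example, ((4 * %x) + 8) /s 4 yields (%x + 2) when the intermediate
+/// arithmetic is known not to overflow (or IgnoreSignificantBits is set),
+/// while (%x + 3) /s 2 yields null because the remainder is not known to be
+/// zero.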
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+ ScalarEvolution &SE,
+ bool IgnoreSignificantBits = false) {
+ // Handle the trivial case, which works for any SCEV type.
+ if (LHS == RHS)
+ return SE.getConstant(LHS->getType(), 1);
+
+ // Handle a few RHS special cases.
+ const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+ if (RC) {
+ const APInt &RA = RC->getAPInt();
+ // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+ // some folding.
+ if (RA.isAllOnesValue())
+ return SE.getMulExpr(LHS, RC);
+ // Handle x /s 1 as x.
+ if (RA == 1)
+ return LHS;
+ }
+
+ // Check for a division of a constant by a constant.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+ if (!RC)
+ return nullptr;
+ const APInt &LA = C->getAPInt();
+ const APInt &RA = RC->getAPInt();
+ if (LA.srem(RA) != 0)
+ return nullptr;
+ return SE.getConstant(LA.sdiv(RA));
+ }
+
+ // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+ if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
+ const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Step) return nullptr;
+ const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Start) return nullptr;
+ // FlagNW is independent of the start value, step direction, and is
+ // preserved with smaller magnitude steps.
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+ }
+ return nullptr;
+ }
+
+ // Distribute the sdiv over add operands, if the add doesn't overflow.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+ if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+ SmallVector<const SCEV *, 8> Ops;
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
+ if (!Op) return nullptr;
+ Ops.push_back(Op);
+ }
+ return SE.getAddExpr(Ops);
+ }
+ return nullptr;
+ }
+
+ // Check for a multiply operand that we can pull RHS out of.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+ SmallVector<const SCEV *, 4> Ops;
+ bool Found = false;
+ for (const SCEV *S : Mul->operands()) {
+ if (!Found)
+ if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+ IgnoreSignificantBits)) {
+ S = Q;
+ Found = true;
+ }
+ Ops.push_back(S);
+ }
+ return Found ? SE.getMulExpr(Ops) : nullptr;
+ }
+ return nullptr;
+ }
+
+ // Otherwise we don't know.
+ return nullptr;
+}
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
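+/// For example, given S = (42 + %a + %b), this returns 42 and mutates S to
+/// (%a + %b); if no constant addend is found, it returns 0 and leaves S
+/// unchanged.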
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ if (C->getAPInt().getMinSignedBits() <= 64) {
+ S = SE.getConstant(C->getType(), 0);
+ return C->getValue()->getSExtValue();
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return 0;
+}
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded.
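+/// For example, if S is simply the address of @gv, this returns @gv and
+/// mutates S to zero.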
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+ S = SE.getConstant(GV->getType(), 0);
+ return GV;
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+ GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+ if (Result)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+ GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+ if (Result)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return nullptr;
+}
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::prefetch:
+ if (II->getArgOperand(0) == OperandVal)
+ isAddress = true;
+ break;
+ }
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
+ }
+ return isAddress;
+}
+
+/// Return the type of the memory being accessed.
+static MemAccessTy getAccessType(const Instruction *Inst) {
+ MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ AccessTy.MemTy = SI->getOperand(0)->getType();
+ AccessTy.AddrSpace = SI->getPointerAddressSpace();
+ } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+ }
+
+ // All pointers have the same requirements, so canonicalize them to an
+ // arbitrary pointer type to minimize variation.
+ if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+ AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+ PTy->getAddressSpace());
+
+ return AccessTy;
+}
+
+/// Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ if (SE.isSCEVable(PN->getType()) &&
+ (SE.getEffectiveSCEVType(PN->getType()) ==
+ SE.getEffectiveSCEVType(AR->getType())) &&
+ SE.getSCEV(PN) == AR)
+ return true;
+ }
+ return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
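+/// For example, multiplying an unknown value by a constant, such as (4 * %n),
+/// is considered cheap, as is an AddRec that is already a phi in its loop.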
+static bool isHighCostExpansion(const SCEV *S,
+ SmallPtrSetImpl<const SCEV*> &Processed,
+ ScalarEvolution &SE) {
+ // Zero/One operand expressions
+ switch (S->getSCEVType()) {
+ case scUnknown:
+ case scConstant:
+ return false;
+ case scTruncate:
+ return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+ Processed, SE);
+ case scZeroExtend:
+ return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ case scSignExtend:
+ return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ }
+
+ if (!Processed.insert(S).second)
+ return false;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands()) {
+ if (isHighCostExpansion(S, Processed, SE))
+ return true;
+ }
+ return false;
+ }
+
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ if (Mul->getNumOperands() == 2) {
+ // Multiplication by a constant is ok
+ if (isa<SCEVConstant>(Mul->getOperand(0)))
+ return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+ // If we have the value of one operand, check if an existing
+ // multiplication already generates this expression.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+ Value *UVal = U->getValue();
+ for (User *UR : UVal->users()) {
+ // If U is a constant, it may be used by a ConstantExpr.
+ Instruction *UI = dyn_cast<Instruction>(UR);
+ if (UI && UI->getOpcode() == Instruction::Mul &&
+ SE.isSCEVable(UI->getType())) {
+ return SE.getSCEV(UI) == Mul;
+ }
+ }
+ }
+ }
+ }
+
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (isExistingPhi(AR, SE))
+ return false;
+ }
+
+ // For now, consider any other type of expression (div/mul/min/max) high cost.
+ return true;
+}
+
+/// If any of the instructions in the specified set are trivially dead, delete
+/// them and see if this makes any of their operands subsequently dead.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ bool Changed = false;
+
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = dyn_cast_or_null<Instruction>(V);
+
+ if (!I || !isInstructionTriviallyDead(I))
+ continue;
+
+ for (Use &O : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(O)) {
+ O = nullptr;
+ if (U->use_empty())
+ DeadInsts.emplace_back(U);
+ }
+
+ I->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+namespace {
+
+class LSRUse;
+
+} // end anonymous namespace
+
+/// \brief Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae.
+class Cost {
+ TargetTransformInfo::LSRCost C;
+
+public:
+ Cost() {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
+
+ bool isLess(Cost &Other, const TargetTransformInfo &TTI);
+
+ void Lose();
+
+#ifndef NDEBUG
+ // Once any of the metrics loses, they must all remain losers.
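+ // That is, roughly: either none of the metrics has been saturated to ~0u,
+ // or all of them have been.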
+ bool isValid() {
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+ }
+#endif
+
+ bool isLoser() {
+ assert(isValid() && "invalid cost");
+ return C.NumRegs == ~0u;
+ }
+
+ void RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+private:
+ void RateRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT);
+ void RatePrimaryRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst;
+
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this set is non-empty and holds the loops associated with the
+ /// induction variable.
+ PostIncLoopSet PostIncLoops;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset;
+
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ LSRFixup();
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+ KindType Kind;
+ MemAccessTy AccessTy;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 8> Fixups;
+
+ /// Keep track of the min and max offsets of the fixups.
+ int64_t MinOffset;
+ int64_t MaxOffset;
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ LSRUse(KindType K, MemAccessTy AT)
+ : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+ AllFixupsOutsideLoop(true), RigidFormula(false),
+ WidestFixupType(nullptr) {}
+
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ void pushFixup(LSRFixup &f) {
+ Fixups.push_back(f);
+ if (f.Offset > MaxOffset)
+ MaxOffset = f.Offset;
+ if (f.Offset < MinOffset)
+ MinOffset = f.Offset;
+ }
+
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
+ void DeleteFormula(Formula &F);
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Tally up interesting quantities from the given register.
+void Cost::RateRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT) {
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
+ if (AR->getLoop() != L) {
+ // If the AddRec exists, consider its register free and leave it alone.
+ if (isExistingPhi(AR, SE))
+ return;
+
+ // It is bad to allow LSR for the current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
+ return;
+ }
+ C.AddRecCost += 1; // TODO: This should be a function of the stride.
+
+ // Add the step value register, if it needs one.
+ // TODO: The non-affine case isn't precisely modeled here.
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+ if (!Regs.count(AR->getOperand(1))) {
+ RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ if (isLoser())
+ return;
+ }
+ }
+ }
+ ++C.NumRegs;
+
+ // Rough heuristic; favor registers which don't require extra setup
+ // instructions in the preheader.
+ if (!isa<SCEVUnknown>(Reg) &&
+ !isa<SCEVConstant>(Reg) &&
+ !(isa<SCEVAddRecExpr>(Reg) &&
+ (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+ isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+ ++C.SetupCost;
+
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ SE.hasComputableLoopEvolution(Reg, L);
+}
+
+/// Record this register in the set. If we haven't seen it before, rate
+/// it. Optional LoserRegs provides a way to declare any formula that refers to
+/// one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ if (LoserRegs && LoserRegs->count(Reg)) {
+ Lose();
+ return;
+ }
+ if (Regs.insert(Reg).second) {
+ RateRegister(Reg, Regs, L, SE, DT);
+ if (LoserRegs && isLoser())
+ LoserRegs->insert(Reg);
+ }
+}
+
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const Loop *L,
+ ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
+ // Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (VisitedRegs.count(ScaledReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+ if (isLoser())
+ return;
+ }
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (VisitedRegs.count(BaseReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+ if (isLoser())
+ return;
+ }
+
+ // Determine how many (unfolded) adds we'll need inside the loop.
+ size_t NumBaseParts = F.getNumRegs();
+ if (NumBaseParts > 1)
+ // Do not count the base and a possible second register if the target
+ // allows folding 2 registers.
+ C.NumBaseAdds +=
+ NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
+
+ // Accumulate non-free scaling amounts.
+ C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
+
+ // Tally up the non-zero immediates.
+ for (const LSRFixup &Fixup : LU.Fixups) {
+ int64_t O = Fixup.Offset;
+ int64_t Offset = (uint64_t)O + F.BaseOffset;
+ if (F.BaseGV)
+ C.ImmCost += 64; // Handle symbolic values conservatively.
+ // TODO: This should probably be the pointer size.
+ else if (Offset != 0)
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+ // Check with target if this offset with this instruction is
+ // specifically not supported.
+ if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
+ !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost, exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
+ // additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // The cost already exceeded TTIRegNum, so only newly added registers can add
+ // new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
+ }
+
+ // If the ICmpZero formula doesn't end with 0, it cannot be replaced by just
+ // an add or sub. We'll need to compare the final result of the AddRec, which
+ // means we'll need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+ assert(isValid() && "invalid cost");
+}
+
+/// Set this cost to a losing value.
+void Cost::Lose() {
+ C.Insns = ~0u;
+ C.NumRegs = ~0u;
+ C.AddRecCost = ~0u;
+ C.NumIVMuls = ~0u;
+ C.NumBaseAdds = ~0u;
+ C.ImmCost = ~0u;
+ C.SetupCost = ~0u;
+ C.ScaleCost = ~0u;
+}
+
+/// Choose the lower cost.
+bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI.isLSRCostLess(C, Other.C);
+}
+
+void Cost::print(raw_ostream &OS) const {
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+LSRFixup::LSRFixup()
+ : UserInst(nullptr), OperandValToReplace(nullptr),
+ Offset(0) {}
+
+/// Test whether this fixup always uses its value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+ // PHI nodes use their value in their incoming blocks.
+ if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == OperandValToReplace &&
+ L->contains(PN->getIncomingBlock(i)))
+ return false;
+ return true;
+ }
+
+ return !L->contains(UserInst);
+}
+
+void LSRFixup::print(raw_ostream &OS) const {
+ OS << "UserInst=";
+ // Store is common and interesting enough to be worth special-casing.
+ if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+ OS << "store ";
+ Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+ } else if (UserInst->getType()->isVoidTy())
+ OS << UserInst->getOpcodeName();
+ else
+ UserInst->printAsOperand(OS, /*PrintType=*/false);
+
+ OS << ", OperandValToReplace=";
+ OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+ for (const Loop *PIL : PostIncLoops) {
+ OS << ", PostIncLoop=";
+ PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ if (Offset != 0)
+ OS << ", Offset=" << Offset;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this use has a formula which has the same registers as the
+/// given formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ std::sort(Key.begin(), Key.end());
+ return Uniquifier.count(Key);
+}
+
+/// Return the probability of selecting a formula that does not reference Reg.
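+/// For example, if 2 of 8 formulae for this use reference Reg, this returns
+/// (8 - 2) / 8 = 0.75.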
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise. The formula must be in canonical form.
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
+
+ if (!Formulae.empty() && RigidFormula)
+ return false;
+
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ std::sort(Key.begin(), Key.end());
+
+ if (!Uniquifier.insert(Key).second)
+ return false;
+
+ // Using a register to hold the value of 0 is not profitable.
+ assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+ "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+ for (const SCEV *BaseReg : F.BaseRegs)
+ assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+ // Add the formula to the list.
+ Formulae.push_back(F);
+
+ // Record registers now being used by this use.
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ Regs.insert(F.ScaledReg);
+
+ return true;
+}
+
+/// Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+ if (&F != &Formulae.back())
+ std::swap(F, Formulae.back());
+ Formulae.pop_back();
+}
+
+/// Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
+ Regs.clear();
+ for (const Formula &F : Formulae) {
+ if (F.ScaledReg) Regs.insert(F.ScaledReg);
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ }
+
+ // Update the RegTracker.
+ for (const SCEV *S : OldRegs)
+ if (!Regs.count(S))
+ RegUses.dropRegister(S, LUIdx);
+}
+
+void LSRUse::print(raw_ostream &OS) const {
+ OS << "LSR Use: Kind=";
+ switch (Kind) {
+ case Basic: OS << "Basic"; break;
+ case Special: OS << "Special"; break;
+ case ICmpZero: OS << "ICmpZero"; break;
+ case Address:
+ OS << "Address of ";
+ if (AccessTy.MemTy->isPointerTy())
+ OS << "pointer"; // the full pointer type could be really verbose
+ else {
+ OS << *AccessTy.MemTy;
+ }
+
+ OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+ }
+
+ OS << ", Offsets={";
+ bool NeedComma = false;
+ for (const LSRFixup &Fixup : Fixups) {
+ if (NeedComma) OS << ',';
+ OS << Fixup.Offset;
+ NeedComma = true;
+ }
+ OS << '}';
+
+ if (AllFixupsOutsideLoop)
+ OS << ", all-fixups-outside-loop";
+
+ if (WidestFixupType)
+ OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
+ switch (Kind) {
+ case LSRUse::Address:
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale, AccessTy.AddrSpace);
+
+ case LSRUse::ICmpZero:
+ // There's not even a target hook for querying whether it would be legal to
+ // fold a GV into an ICmp.
+ if (BaseGV)
+ return false;
+
+ // ICmp only has two operands; don't allow more than two non-trivial parts.
+ if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+ return false;
+
+ // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+ // putting the scaled register in the other operand of the icmp.
+ if (Scale != 0 && Scale != -1)
+ return false;
+
+ // If we have low-level target information, ask the target if it can fold an
+ // integer immediate on an icmp.
+ if (BaseOffset != 0) {
+ // We have one of:
+ // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+ // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+ // Offs is the ICmp immediate.
+ if (Scale == 0)
+ // The cast does the right thing with INT64_MIN.
+ BaseOffset = -(uint64_t)BaseOffset;
+ return TTI.isLegalICmpImmediate(BaseOffset);
+ }
+
+ // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+ return true;
+
+ case LSRUse::Basic:
+ // Only handle single-register values.
+ return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+ case LSRUse::Special:
+ // Special case Basic to handle -1 scales.
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
+ // Check for overflow.
+ if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+ (MinOffset > 0))
+ return false;
+ MinOffset = (uint64_t)BaseOffset + MinOffset;
+ if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+ (MaxOffset > 0))
+ return false;
+ MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
+ HasBaseReg, Scale) &&
+ isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ const Formula &F, const Loop &L) {
+ // For the purpose of isAMCompletelyFolded, either having a canonical formula
+ // or a scale not equal to zero is correct.
+ // Problems may arise from non-canonical formulae having a scale == 0.
+ // Strictly speaking it would be best to just rely on canonical formulae.
+ // However, when we generate the scaled formulae, we first check that the
+ // scaling factor is profitable before computing the actual ScaledReg for the
+ // sake of compile time.
+ assert((F.isCanonical(L) || F.Scale != 0));
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the current formula.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+ // We know how to expand completely foldable formulae.
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale) ||
+ // Or formulae that use a base register produced by a sum of base
+ // registers.
+ (Scale == 1 &&
+ isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ BaseGV, BaseOffset, true, 0));
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F) {
+ return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+ F.Scale);
+}
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
+ if (!F.Scale)
+ return 0;
+
+ // If the use is not completely folded in that instruction, we will have to
+ // pay an extra cost only for scale != 1.
+ if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F, L))
+ return F.Scale != 1;
+
+ switch (LU.Kind) {
+ case LSRUse::Address: {
+ // Check the scaling factor cost with both the min and max offsets.
+ int ScaleCostMinOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+ int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+
+ assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+ "Legal addressing mode has an illegal cost!");
+ return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+ }
+ case LSRUse::ICmpZero:
+ case LSRUse::Basic:
+ case LSRUse::Special:
+ // The use is completely folded, i.e., everything is folded into the
+ // instruction.
+ return 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ // Canonicalize a scale of 1 to a base register if the formula doesn't
+ // already have a base register.
+ if (!HasBaseReg && Scale == 1) {
+ Scale = 0;
+ HasBaseReg = true;
+ }
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const SCEV *S,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (S->isZero()) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t BaseOffset = ExtractImmediate(S, SE);
+ GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+ // If there's anything else involved, it's not foldable.
+ if (!S->isZero()) return false;
+
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments. Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+ Instruction *UserInst;
+ Value* IVOperand;
+ const SCEV *IncExpr;
+
+ IVInc(Instruction *U, Value *O, const SCEV *E):
+ UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order. We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+ SmallVector<IVInc,1> Incs;
+ const SCEV *ExprBase;
+
+ IVChain() : ExprBase(nullptr) {}
+
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+
+ // Return the first increment in the chain.
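+ // Incs[0] is the chain head, whose IncExpr is the absolute expression for
+ // the original IVOperand rather than an increment, so it is skipped here.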
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return std::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+ // Returns true if this chain contains any increments.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // Returns true if IncExpr can be profitably added to this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
+
+/// Helper for CollectChains to track multiple IV increment uses. Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+ SmallPtrSet<Instruction*, 4> FarUsers;
+ SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+class LSRInstance {
+ IVUsers &IU;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ const TargetTransformInfo &TTI;
+ Loop *const L;
+ bool Changed;
+
+ /// This is the insert position at which the current loop's induction
+ /// variable increment should be placed. In simple loops, this is the latch
+ /// block's terminator. But in more complicated cases, this is a position
+ /// which will dominate all the in-loop post-increment users.
+ Instruction *IVIncInsertPos;
+
+ /// Interesting factors between use strides.
+ ///
+ /// We explicitly use a SetVector which contains a SmallSet, instead of the
+ /// default, a SmallDenseSet, because we need to use the full range of
+ /// int64_ts, and there's currently no good way of doing that with
+ /// SmallDenseSet.
+ SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
+
+ /// Interesting use types, to facilitate truncation reuse.
+ SmallSetVector<Type *, 4> Types;
+
+ /// The list of interesting uses.
+ SmallVector<LSRUse, 16> Uses;
+
+ /// Track which uses use which register candidates.
+ RegUseTracker RegUses;
+
+ // Limit the number of chains to avoid quadratic behavior. We don't expect to
+ // have more than a few IV increment chains in a loop. Missing a Chain falls
+ // back to normal LSR behavior for those uses.
+ static const unsigned MaxChains = 8;
+
+ /// IV users can form a chain of IV increments.
+ SmallVector<IVChain, MaxChains> IVChainVec;
+
+ /// IV users that belong to profitable IVChains.
+ SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+ void OptimizeShadowIV();
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+ ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+ void OptimizeLoopTermCond();
+
+ void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec);
+ void FinalizeChain(IVChain &Chain);
+ void CollectChains();
+ void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
+
+ void CollectInterestingTypesAndFactors();
+ void CollectFixupsAndInitialFormulae();
+
+ // Support for sharing of LSRUses between LSRFixups.
+ typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
+ UseMapTy UseMap;
+
+ bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+ std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
+
+ void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+ LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+ void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void CountRegisters(const Formula &F, size_t LUIdx);
+ bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+ void CollectLoopInvariantFixupsAndFormulae();
+
+ void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+ unsigned Depth = 0);
+
+ void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, unsigned Depth,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg = false);
+ void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateCrossUseConstantOffsets();
+ void GenerateAllReuseFormulae();
+
+ void FilterOutUndesirableDedicatedRegisters();
+
+ size_t EstimateSearchSpaceComplexity() const;
+ void NarrowSearchSpaceByDetectingSupersets();
+ void NarrowSearchSpaceByCollapsingUnrolledCode();
+ void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
+ void NarrowSearchSpaceByPickingWinnerRegs();
+ void NarrowSearchSpaceUsingHeuristics();
+
+ void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const;
+ void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+ BasicBlock::iterator
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator
+ AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const;
+
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+ LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+ LoopInfo &LI, const TargetTransformInfo &TTI);
+
+ bool getChanged() const { return Changed; }
+
+ void print_factors_and_types(raw_ostream &OS) const;
+ void print_fixups(raw_ostream &OS) const;
+ void print_uses(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// If IV is used in an int-to-float cast inside the loop then try to eliminate
+/// the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+ UI != E; /* empty */) {
+ IVUsers::const_iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ Type *DestTy = nullptr;
+ bool IsSigned = false;
+
+    /* If shadow use is an int->float cast then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
+ DestTy = UCast->getDestTy();
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
+ DestTy = SCast->getDestTy();
+ }
+ if (!DestTy) continue;
+
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
+
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+ Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
+
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = nullptr;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ // Ignore negative constants, as the code below doesn't handle them
+ // correctly. TODO: Remove this restriction.
+ if (!C->getValue().isStrictlyPositive()) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+ Instruction::FAdd : Instruction::FSub,
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ Changed = true;
+ break;
+ }
+}
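+
+// As a rough sketch of the resulting IR (value and block names here are
+// illustrative, not taken from a real test case), a cast such as
+//   %conv = uitofp i32 %i to double
+// in a loop whose integer IV starts at 0 and steps by 1 is replaced by a
+// parallel floating-point recurrence:
+//   %IV.S. = phi double [ 0.0, %preheader ], [ %IV.S.next., %latch ]
+//   %IV.S.next. = fadd double %IV.S., 1.0
+// after which all uses of %conv are rewritten to %IV.S. and the cast is erased.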
+
+/// If Cond has an operand that is an expression of an IV, set the IV user and
+/// stride information and return true, otherwise return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+ for (IVStrideUse &U : IU)
+ if (U.getUser() == Cond) {
+ // NOTE: we could handle setcc instructions with multiple uses here, but
+      // InstCombine does it as well for simple uses; it's not clear that this
+      // occurs often enough in real code to be worth handling.
+ CondUse = &U;
+ return true;
+ }
+ return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// max = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting this type of loop and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
+/// the instructions for the maximum computation.
+///
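+/// At the IR level the rewrite looks roughly like this (names are
+/// illustrative, not taken from a real test case): a latch compare such as
+///   %exitcond = icmp ne i64 %i.next, %max
+/// where %max is the select/icmp pair computing smax(%n, 1), becomes
+///   %scmp = icmp slt i64 %i.next, %n
+/// and the max computation is deleted once it has no other uses.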
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond;
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+ // Add one to the backedge-taken count to get the trip count.
+ const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+ if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+ // Check for a max calculation that matches the pattern. There's no check
+ // for ICMP_ULE here because the comparison would be with zero, which
+ // isn't interesting.
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ const SCEVNAryExpr *Max = nullptr;
+ if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+ Pred = ICmpInst::ICMP_SLE;
+ Max = S;
+ } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_SLT;
+ Max = S;
+ } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_ULT;
+ Max = U;
+ } else {
+ // No match; bail.
+ return Cond;
+ }
+
+ // To handle a max with more than two operands, this optimization would
+ // require additional checking and setup.
+ if (Max->getNumOperands() != 2)
+ return Cond;
+
+ const SCEV *MaxLHS = Max->getOperand(0);
+ const SCEV *MaxRHS = Max->getOperand(1);
+
+ // ScalarEvolution canonicalizes constants to the left. For < and >, look
+ // for a comparison with 1. For <= and >=, a comparison with zero.
+ if (!MaxLHS ||
+ (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+ return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = nullptr;
+ if (ICmpInst::isTrueWhenEqual(Pred)) {
+ // Look for n+1, and grab n.
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (!NewRHS)
+ return Cond;
+ } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+ NewRHS = Sel->getOperand(2);
+ else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+ NewRHS = SU->getValue();
+ else
+ // Max doesn't match expected pattern.
+ return Cond;
+
+ // Determine the new comparison opcode. It may be signed or unsigned,
+ // and the original comparison may be either equality or inequality.
+ if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // Ok, everything looks ok to change the condition into an SLT or SGE and
+ // delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond);
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
+
+/// Change loop terminating condition to use the postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+ SmallPtrSet<Instruction *, 4> PostIncs;
+
+ // We need a different set of heuristics for rotated and non-rotated loops.
+  // If a loop is rotated then the latch is also the exiting block, so inserting
+ // post-inc expressions just before the latch is ideal. To reduce live ranges
+ // it also makes sense to rewrite terminating conditions to use post-inc
+ // expressions.
+ //
+  // If the loop is not rotated then the latch is not an exiting block; the
+  // exit check is done in the loop head. Adding post-inc expressions before the
+ // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
+ // in the loop body. In this case we do *not* want to use post-inc expressions
+ // in the latch check, and we want to insert post-inc expressions before
+ // the backedge.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+ return LatchBlock != BB;
+ })) {
+ // The backedge doesn't exit the loop; treat this as a head-tested loop.
+ IVIncInsertPos = LatchBlock->getTerminator();
+ return;
+ }
+
+ // Otherwise treat this as a rotated loop.
+ for (BasicBlock *ExitingBlock : ExitingBlocks) {
+
+ // Get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+    // FIXME: Overly conservative, termination condition could be an 'or', etc.
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
+
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = nullptr;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse))
+ continue;
+
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ // One consequence of doing this now is that it disrupts the count-down
+ // optimization. That's not always a bad thing though, because in such
+ // cases it may still be worthwhile to avoid a max.
+ Cond = OptimizeMax(Cond, CondUse);
+
+ // If this exiting block dominates the latch block, it may also use
+ // the post-inc value if it won't be shared with other uses.
+ // Check for dominance.
+ if (!DT.dominates(ExitingBlock, LatchBlock))
+ continue;
+
+ // Conservatively avoid trying to use the post-inc value in non-latch
+ // exits if there may be pre-inc users in intervening blocks.
+ if (LatchBlock != ExitingBlock)
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ // Test if the use is reachable from the exiting block. This dominator
+ // query is a conservative approximation of reachability.
+ if (&*UI != CondUse &&
+ !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ // Conservatively assume there may be reuse if the quotient of their
+ // strides could be a legal scale.
+ const SCEV *A = IU.getStride(*CondUse, L);
+ const SCEV *B = IU.getStride(*UI, L);
+ if (!A || !B) continue;
+ if (SE.getTypeSizeInBits(A->getType()) !=
+ SE.getTypeSizeInBits(B->getType())) {
+ if (SE.getTypeSizeInBits(A->getType()) >
+ SE.getTypeSizeInBits(B->getType()))
+ B = SE.getSignExtendExpr(B, A->getType());
+ else
+ A = SE.getSignExtendExpr(A, B->getType());
+ }
+ if (const SCEVConstant *D =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+ const ConstantInt *C = D->getValue();
+ // Stride of one or negative one can have reuse with non-addresses.
+ if (C->isOne() || C->isMinusOne())
+ goto decline_post_inc;
+ // Avoid weird situations.
+ if (C->getValue().getMinSignedBits() >= 64 ||
+ C->getValue().isMinSignedValue())
+ goto decline_post_inc;
+ // Check for possible scaled-address reuse.
+ MemAccessTy AccessTy = getAccessType(UI->getUser());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
+ }
+
+ DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != TermBr) {
+ if (Cond->hasOneUse()) {
+ Cond->moveBefore(TermBr);
+ } else {
+        // Clone the terminating condition and insert it at the end of the
+        // exiting block.
+ ICmpInst *OldCond = Cond;
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+ TermBr->replaceUsesOfWith(OldCond, Cond);
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->transformToPostInc(L);
+ Changed = true;
+
+ PostIncs.insert(Cond);
+ decline_post_inc:;
+ }
+
+ // Determine an insertion point for the loop induction variable increment. It
+ // must dominate all the post-inc comparisons we just set up, and it must
+ // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator();
+ for (Instruction *Inst : PostIncs) {
+ BasicBlock *BB =
+ DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
+ else if (BB != IVIncInsertPos->getParent())
+ IVIncInsertPos = BB->getTerminator();
+ }
+}
+
+/// Determine if the given use can accommodate a fixup at the given offset and
+/// other details. If so, update the use and return true.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+ bool HasBaseReg, LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ int64_t NewMinOffset = LU.MinOffset;
+ int64_t NewMaxOffset = LU.MaxOffset;
+ MemAccessTy NewAccessTy = AccessTy;
+
+ // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+  // something conservative; however, this can pessimize the case where one of
+  // the uses has all of its uses outside the loop, for example.
+ if (LU.Kind != Kind)
+ return false;
+
+ // Check for a mismatched access type, and fall back conservatively as needed.
+ // TODO: Be less conservative when the type is similar and can use the same
+ // addressing modes.
+ if (Kind == LSRUse::Address) {
+ if (AccessTy.MemTy != LU.AccessTy.MemTy) {
+ NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
+ AccessTy.AddrSpace);
+ }
+ }
+
+ // Conservatively assume HasBaseReg is true for now.
+ if (NewOffset < LU.MinOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ LU.MaxOffset - NewOffset, HasBaseReg))
+ return false;
+ NewMinOffset = NewOffset;
+ } else if (NewOffset > LU.MaxOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ NewOffset - LU.MinOffset, HasBaseReg))
+ return false;
+ NewMaxOffset = NewOffset;
+ }
+
+ // Update the use.
+ LU.MinOffset = NewMinOffset;
+ LU.MaxOffset = NewMaxOffset;
+ LU.AccessTy = NewAccessTy;
+ return true;
+}
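+
+// For example (offsets made up for illustration): if an existing Address use
+// spans offsets [0, 64] and a new fixup arrives at offset 128, the use is
+// widened to [0, 128] only if an immediate of 128 (the distance back to
+// MinOffset) is still foldable for this kind and access type; otherwise
+// reconciliation fails and the caller falls back to creating a separate use.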
+
+/// Return an LSRUse index and an offset value for a fixup which needs the given
+/// expression, with the given kind and optional access type. Either reuse an
+/// existing use or create a new one, as needed.
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+ LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ const SCEV *Copy = Expr;
+ int64_t Offset = ExtractImmediate(Expr, SE);
+
+ // Basic uses can't accept any offset, for example.
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
+ Offset, /*HasBaseReg=*/ true)) {
+ Expr = Copy;
+ Offset = 0;
+ }
+
+ std::pair<UseMapTy::iterator, bool> P =
+ UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+ if (!P.second) {
+ // A use already existed with this base.
+ size_t LUIdx = P.first->second;
+ LSRUse &LU = Uses[LUIdx];
+ if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+ // Reuse this use.
+ return std::make_pair(LUIdx, Offset);
+ }
+
+ // Create a new use.
+ size_t LUIdx = Uses.size();
+ P.first->second = LUIdx;
+ Uses.push_back(LSRUse(Kind, AccessTy));
+ LSRUse &LU = Uses[LUIdx];
+
+ LU.MinOffset = Offset;
+ LU.MaxOffset = Offset;
+ return std::make_pair(LUIdx, Offset);
+}
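+
+// For example (names and constants are illustrative): ExtractImmediate peels
+// the constant addend off the expression, so {(%p + 16),+,4} is keyed in
+// UseMap as {%p,+,4} with Offset 16, assuming an offset of 16 is always
+// foldable for this kind and access type. A later fixup on {(%p + 32),+,4}
+// then hits the same map entry and simply widens that use's offset range.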
+
+/// Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+ if (&LU != &Uses.back())
+ std::swap(LU, Uses.back());
+ Uses.pop_back();
+
+ // Update RegUses.
+ RegUses.swapAndDropUse(LUIdx, Uses.size());
+}
+
+/// Look for a use distinct from OrigLU which has a formula with the same
+/// registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+ const LSRUse &OrigLU) {
+ // Search all uses for the formula. This could be more clever.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // Check whether this use is close enough to OrigLU, to see whether it's
+ // worthwhile looking through its formulae.
+ // Ignore ICmpZero uses because they may contain formulae generated by
+ // GenerateICmpZeroScales, in which case adding fixup offsets may
+ // be invalid.
+ if (&LU != &OrigLU &&
+ LU.Kind != LSRUse::ICmpZero &&
+ LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+ LU.WidestFixupType == OrigLU.WidestFixupType &&
+ LU.HasFormulaWithSameRegs(OrigF)) {
+ // Scan through this use's formulae.
+ for (const Formula &F : LU.Formulae) {
+ // Check to see if this formula has the same registers and symbols
+ // as OrigF.
+ if (F.BaseRegs == OrigF.BaseRegs &&
+ F.ScaledReg == OrigF.ScaledReg &&
+ F.BaseGV == OrigF.BaseGV &&
+ F.Scale == OrigF.Scale &&
+ F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+ if (F.BaseOffset == 0)
+ return &LU;
+ // This is the formula where all the registers and symbols matched;
+ // there aren't going to be any others. Since we declined it, we
+ // can skip the rest of the formulae and proceed to the next LSRUse.
+ break;
+ }
+ }
+ }
+ }
+
+ // Nothing looked good.
+ return nullptr;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+ SmallSetVector<const SCEV *, 4> Strides;
+
+ // Collect interesting types and strides.
+ SmallVector<const SCEV *, 4> Worklist;
+ for (const IVStrideUse &U : IU) {
+ const SCEV *Expr = IU.getExpr(U);
+
+ // Collect interesting types.
+ Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+ // Add strides for mentioned loops.
+ Worklist.push_back(Expr);
+ do {
+ const SCEV *S = Worklist.pop_back_val();
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (AR->getLoop() == L)
+ Strides.insert(AR->getStepRecurrence(SE));
+ Worklist.push_back(AR->getStart());
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ Worklist.append(Add->op_begin(), Add->op_end());
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Compute interesting factors from the set of interesting strides.
+ for (SmallSetVector<const SCEV *, 4>::const_iterator
+ I = Strides.begin(), E = Strides.end(); I != E; ++I)
+ for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+ std::next(I); NewStrideIter != E; ++NewStrideIter) {
+ const SCEV *OldStride = *I;
+ const SCEV *NewStride = *NewStrideIter;
+
+ if (SE.getTypeSizeInBits(OldStride->getType()) !=
+ SE.getTypeSizeInBits(NewStride->getType())) {
+ if (SE.getTypeSizeInBits(OldStride->getType()) >
+ SE.getTypeSizeInBits(NewStride->getType()))
+ NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+ else
+ OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+ }
+ if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ } else if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+ NewStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ }
+ }
+
+ // If all uses use the same type, don't bother looking for truncation-based
+ // reuse.
+ if (Types.size() == 1)
+ Types.clear();
+
+ DEBUG(print_factors_and_types(dbgs()));
+}
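+
+// For example (strides made up): if the loop contains addrecs with strides 4
+// and 8, the factor 2 is recorded. Both division directions are tried, so the
+// order in which the strides were discovered does not matter.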
+
+/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
+/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
+/// IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+ Loop *L, ScalarEvolution &SE) {
+ for(; OI != OE; ++OI) {
+ if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+ if (!SE.isSCEVable(Oper->getType()))
+ continue;
+
+ if (const SCEVAddRecExpr *AR =
+ dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+ if (AR->getLoop() == L)
+ break;
+ }
+ }
+ }
+ return OI;
+}
+
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
+/// a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+ return Trunc->getOperand(0);
+ return Oper;
+}
+
+/// Return true if we allow an IV chain to include both types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+ Type *LType = LVal->getType();
+ Type *RType = RVal->getType();
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces means (possibly)
+ // different types of the pointer implementation,
+ // e.g. i16 vs i32 so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+ switch (S->getSCEVType()) {
+  default: // including scUnknown.
+ return S;
+ case scConstant:
+ return nullptr;
+ case scTruncate:
+ return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+ case scZeroExtend:
+ return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+ case scSignExtend:
+ return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+ case scAddExpr: {
+ // Skip over scaled operands (scMulExpr) to follow add operands as long as
+ // there's nothing more complex.
+ // FIXME: not sure if we want to recognize negation.
+ const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+ E(Add->op_begin()); I != E; ++I) {
+ const SCEV *SubExpr = *I;
+ if (SubExpr->getSCEVType() == scAddExpr)
+ return getExprBase(SubExpr);
+
+ if (SubExpr->getSCEVType() != scMulExpr)
+ return SubExpr;
+ }
+ return S; // all operands are scaled, be conservative.
+ }
+ case scAddRecExpr:
+ return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+ }
+}
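+
+// For instance (names made up): for an addrec such as {(%base + 16),+,4} the
+// base comes from the start expression (%base + 16); its operands are scanned
+// right to left (SCEV sorts constants leftmost and SCEVUnknowns rightmost),
+// scaled scMulExpr operands are skipped, and the SCEVUnknown %base is
+// returned. A bare constant yields null.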
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution &SE) {
+ // Aggressively form chains when -stress-ivchain.
+ if (StressIVChain)
+ return true;
+
+ // Do not replace a constant offset from IV head with a nonconstant IV
+ // increment.
+ if (!isa<SCEVConstant>(IncExpr)) {
+ const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+ if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+ return false;
+ }
+
+ SmallPtrSet<const SCEV*, 8> Processed;
+ return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in another chain.
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+ if (StressIVChain)
+ return true;
+
+ if (!Chain.hasIncs())
+ return false;
+
+ if (!Users.empty()) {
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst : Users) {
+ dbgs() << " " << *Inst << "\n";
+ });
+ return false;
+ }
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so initialize cost to 1.
+ int cost = 1;
+
+ // A complete chain likely eliminates the need for keeping the original IV in
+ // a register. LSR does not currently know how to form a complete chain unless
+ // the header phi already exists.
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+ --cost;
+ }
+ const SCEV *LastIncExpr = nullptr;
+ unsigned NumConstIncrements = 0;
+ unsigned NumVarIncrements = 0;
+ unsigned NumReusedIncrements = 0;
+ for (const IVInc &Inc : Chain) {
+ if (Inc.IncExpr->isZero())
+ continue;
+
+ // Incrementing by zero or some constant is neutral. We assume constants can
+ // be folded into an addressing mode or an add's immediate operand.
+ if (isa<SCEVConstant>(Inc.IncExpr)) {
+ ++NumConstIncrements;
+ continue;
+ }
+
+ if (Inc.IncExpr == LastIncExpr)
+ ++NumReusedIncrements;
+ else
+ ++NumVarIncrements;
+
+ LastIncExpr = Inc.IncExpr;
+ }
+ // An IV chain with a single increment is handled by LSR's postinc
+ // uses. However, a chain with multiple increments requires keeping the IV's
+ // value live longer than it needs to be if chained.
+ if (NumConstIncrements > 1)
+ --cost;
+
+ // Materializing increment expressions in the preheader that didn't exist in
+ // the original code may cost a register. For example, sign-extended array
+ // indices can produce ridiculous increments like this:
+ // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+ cost += NumVarIncrements;
+
+ // Reusing variable increments likely saves a register to hold the multiple of
+ // the stride.
+ cost -= NumReusedIncrements;
+
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
+
+ return cost < 0;
+}
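+
+// Worked example of the cost accounting above (values made up, and assuming
+// the chain does not end in a header phi matching the increment): a chain
+// whose four increments are all the same loop-invariant %stride counts one
+// variable increment and three reused ones, so cost = 1 + 1 - 3 = -1 and the
+// chain is kept; with only three such increments the cost is 0 and the chain
+// is dropped.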
+
+/// Add this IV user to an existing chain or make it the head of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+ // When IVs are used as types of varying widths, they are generally converted
+ // to a wider type with some uses remaining narrow under a (free) trunc.
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+ // Visit all existing chains. Check if its IVOper can be computed as a
+ // profitable loop invariant increment from the last link in the Chain.
+ unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ const SCEV *LastIncExpr = nullptr;
+ for (; ChainIdx < NChains; ++ChainIdx) {
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+ if (!isCompatibleIVType(PrevIV, NextIV))
+ continue;
+
+ // A phi node terminates a chain.
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
+ continue;
+
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+ LastIncExpr = IncExpr;
+ break;
+ }
+ }
+ // If we haven't found a chain, create a new one, unless we hit the max. Don't
+ // bother for phi nodes, because they must be last in the chain.
+ if (ChainIdx == NChains) {
+ if (isa<PHINode>(UserInst))
+ return;
+ if (NChains >= MaxChains && !StressIVChain) {
+ DEBUG(dbgs() << "IV Chain Limit\n");
+ return;
+ }
+ LastIncExpr = OperExpr;
+ // IVUsers may have skipped over sign/zero extensions. We don't currently
+ // attempt to form chains involving extensions unless they can be hoisted
+ // into this loop's AddRec.
+ if (!isa<SCEVAddRecExpr>(LastIncExpr))
+ return;
+ ++NChains;
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
+ ChainUsersVec.resize(NChains);
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+ }
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+ // This chain's NearUsers become FarUsers.
+ if (!LastIncExpr->isZero()) {
+ ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+ NearUsers.end());
+ NearUsers.clear();
+ }
+
+ // All other uses of IVOperand become near uses of the chain.
+ // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used by the current chain, or can be computed
+  // from one of the chain increments. To be more precise, we could
+  // transitively follow the users and only add leaf IV users to the set.
+ for (User *U : IVOper->users()) {
+ Instruction *OtherUse = dyn_cast<Instruction>(U);
+ if (!OtherUse)
+ continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for( ; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd)
+ continue;
+
+ if (SE.isSCEVable(OtherUse->getType())
+ && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+ && IU.isIVUserOrOperand(OtherUse)) {
+ continue;
+ }
+ NearUsers.insert(OtherUse);
+ }
+
+ // Since this user is part of the chain, it's no longer considered a use
+ // of the chain.
+ ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+/// = A[i]
+/// = A[i+x]
+/// A[i] =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+ DEBUG(dbgs() << "Collecting IV Chains.\n");
+ SmallVector<ChainUsers, 8> ChainUsersVec;
+
+ SmallVector<BasicBlock *,8> LatchPath;
+ BasicBlock *LoopHeader = L->getHeader();
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+ Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+ LatchPath.push_back(Rung->getBlock());
+ }
+ LatchPath.push_back(LoopHeader);
+
+ // Walk the instruction stream from the loop header to the loop latch.
+ for (BasicBlock *BB : reverse(LatchPath)) {
+ for (Instruction &I : *BB) {
+ // Skip instructions that weren't seen by IVUsers analysis.
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
+ continue;
+
+ // Ignore users that are part of a SCEV expression. This way we only
+ // consider leaf IV Users. This effectively rediscovers a portion of
+ // IVUsers analysis but in program order this time.
+ if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
+ continue;
+
+ // Remove this instruction from any NearUsers set it may be in.
+ for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ ChainIdx < NChains; ++ChainIdx) {
+ ChainUsersVec[ChainIdx].NearUsers.erase(&I);
+ }
+ // Search for operands that can be chained.
+ SmallPtrSet<Instruction*, 4> UniqueOperands;
+ User::op_iterator IVOpEnd = I.op_end();
+ User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
+ while (IVOpIter != IVOpEnd) {
+ Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+ if (UniqueOperands.insert(IVOpInst).second)
+ ChainInstruction(&I, IVOpInst, ChainUsersVec);
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ } // Continue walking down the instructions.
+ } // Continue walking down the domtree.
+ // Visit phi backedges to determine if the chain can generate the IV postinc.
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ if (!SE.isSCEVable(PN->getType()))
+ continue;
+
+ Instruction *IncV =
+ dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+ if (IncV)
+ ChainInstruction(PN, IncV, ChainUsersVec);
+ }
+ // Remove any unprofitable chains.
+ unsigned ChainIdx = 0;
+ for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+ UsersIdx < NChains; ++UsersIdx) {
+ if (!isProfitableChain(IVChainVec[UsersIdx],
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ continue;
+    // Preserve the chain at UsersIdx.
+ if (ChainIdx != UsersIdx)
+ IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+ FinalizeChain(IVChainVec[ChainIdx]);
+ ++ChainIdx;
+ }
+ IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+ DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+ for (const IVInc &Inc : Chain) {
+ DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
+ assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+ IVIncSet.insert(UseI);
+ }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+ Value *Operand, const TargetTransformInfo &TTI) {
+ const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+ if (!IncConst || !isAddressUse(UserInst, Operand))
+ return false;
+
+ if (IncConst->getAPInt().getMinSignedBits() > 64)
+ return false;
+
+ MemAccessTy AccessTy = getAccessType(UserInst);
+ int64_t IncOffset = IncConst->getValue()->getSExtValue();
+ if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+                         IncOffset, /*HasBaseReg=*/false))
+ return false;
+
+ return true;
+}
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ // Find the new IVOperand for the head of the chain. It may have been replaced
+ // by LSR.
+ const IVInc &Head = Chain.Incs[0];
+ User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+ User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+ IVOpEnd, L, SE);
+ Value *IVSrc = nullptr;
+ while (IVOpIter != IVOpEnd) {
+ IVSrc = getWideOperand(*IVOpIter);
+
+ // If this operand computes the expression that the chain needs, we may use
+ // it. (Check this after setting IVSrc which is used below.)
+ //
+ // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+ // narrow for the chain, so we can no longer use it. We do allow using a
+    // wider phi, assuming LSR checked for free truncation. In that case we
+ // should already have a truncate on this operand such that
+ // getSCEV(IVSrc) == IncExpr.
+ if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+ || SE.getSCEV(IVSrc) == Head.IncExpr) {
+ break;
+ }
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ if (IVOpIter == IVOpEnd) {
+ // Gracefully give up on this chain.
+ DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ return;
+ }
+
+ DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ Type *IVTy = IVSrc->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+ const SCEV *LeftOverExpr = nullptr;
+ for (const IVInc &Inc : Chain) {
+ Instruction *InsertPt = Inc.UserInst;
+ if (isa<PHINode>(InsertPt))
+ InsertPt = L->getLoopLatch()->getTerminator();
+
+ // IVOper will replace the current IV User's operand. IVSrc is the IV
+ // value currently held in a register.
+ Value *IVOper = IVSrc;
+ if (!Inc.IncExpr->isZero()) {
+ // IncExpr was the result of subtraction of two narrow values, so must
+ // be signed.
+ const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+ LeftOverExpr = LeftOverExpr ?
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+ }
+ if (LeftOverExpr && !LeftOverExpr->isZero()) {
+ // Expand the IV increment.
+ Rewriter.clearPostInc();
+ Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+ const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+ SE.getUnknown(IncV));
+ IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+ // If an IV increment can't be folded, use it as the next IV value.
+ if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+ assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+ IVSrc = IVOper;
+ LeftOverExpr = nullptr;
+ }
+ }
+ Type *OperTy = Inc.IVOperand->getType();
+ if (IVTy != OperTy) {
+ assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+ "cannot extend a chained IV");
+ IRBuilder<> Builder(InsertPt);
+ IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+ }
+ Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+ DeadInsts.emplace_back(Inc.IVOperand);
+ }
+ // If LSR created a new, wider phi, we may also replace its postinc. We only
+ // do this if we also found a wide value for the head of the chain.
+ if (isa<PHINode>(Chain.tailUserInst())) {
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
+ if (!isCompatibleIVType(Phi, IVSrc))
+ continue;
+ Instruction *PostIncV = dyn_cast<Instruction>(
+ Phi->getIncomingValueForBlock(L->getLoopLatch()));
+ if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+ continue;
+ Value *IVOper = IVSrc;
+ Type *PostIncTy = PostIncV->getType();
+ if (IVTy != PostIncTy) {
+ assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+ IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+ Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+ IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+ }
+ Phi->replaceUsesOfWith(PostIncV, IVOper);
+ DeadInsts.emplace_back(PostIncV);
+ }
+ }
+}
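+
+// Illustrative outcome (offsets made up): if every increment in a chain is the
+// constant 4 and such offsets fold into the target's addressing modes, the
+// second user in the chain is rewritten to address off (IVSrc + 4), the third
+// off (IVSrc + 8), and so on, while IVSrc remains the single value kept live
+// for the chain. An increment that cannot be folded instead becomes the new
+// IVSrc for the rest of the chain.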
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+ for (const IVStrideUse &U : IU) {
+ Instruction *UserInst = U.getUser();
+ // Skip IV users that are part of profitable IV Chains.
+ User::op_iterator UseI =
+ find(UserInst->operands(), U.getOperandValToReplace());
+ assert(UseI != UserInst->op_end() && "cannot find IV operand");
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ continue;
+ }
+
+ LSRUse::KindType Kind = LSRUse::Basic;
+ MemAccessTy AccessTy;
+ if (isAddressUse(UserInst, U.getOperandValToReplace())) {
+ Kind = LSRUse::Address;
+ AccessTy = getAccessType(UserInst);
+ }
+
+ const SCEV *S = IU.getExpr(U);
+ PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
+ // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+ // (N - i == 0), and this allows (N - i) to be the expression that we work
+ // with rather than just N or i, so we can consider the register
+ // requirements for both N and i at the same time. Limiting this code to
+ // equality icmps is not a problem because all interesting loops use
+ // equality icmps, thanks to IndVarSimplify.
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
+ if (CI->isEquality()) {
+ // Swap the operands if needed to put the OperandValToReplace on the
+ // left, for consistency.
+ Value *NV = CI->getOperand(1);
+ if (NV == U.getOperandValToReplace()) {
+ CI->setOperand(1, CI->getOperand(0));
+ CI->setOperand(0, NV);
+ NV = CI->getOperand(1);
+ Changed = true;
+ }
+
+ // x == y --> x - y == 0
+ const SCEV *N = SE.getSCEV(NV);
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+ // S is normalized, so normalize N before folding it into S
+ // to keep the result normalized.
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ }
+
+ // -1 and the negations of all interesting strides (except the negation
+ // of -1) are now also interesting.
+ for (size_t i = 0, e = Factors.size(); i != e; ++i)
+ if (Factors[i] != -1)
+ Factors.insert(-(uint64_t)Factors[i]);
+ Factors.insert(-1);
+ }
+
+ // Get or create an LSRUse.
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+
+ // Record the fixup.
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = TmpPostIncLoops;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+ // If this is the first use of this LSRUse, give it a formula.
+ if (LU.Formulae.empty()) {
+ InsertInitialFormula(S, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), LUIdx);
+ }
+ }
+
+ DEBUG(print_fixups(dbgs()));
+}
+
+/// Insert a formula for the given expression into the given use, separating out
+/// loop-variant portions from loop-invariant and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+ // Mark uses whose expressions cannot be expanded.
+ if (!isSafeToExpand(S, SE))
+ LU.RigidFormula = true;
+
+ Formula F;
+ F.initialMatch(S, L, SE);
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// Insert a simple single-register formula for the given expression into the
+/// given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+ LSRUse &LU, size_t LUIdx) {
+ Formula F;
+ F.BaseRegs.push_back(S);
+ F.HasBaseReg = true;
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// Note which registers are used by the given formula, updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+ if (F.ScaledReg)
+ RegUses.countRegister(F.ScaledReg, LUIdx);
+ for (const SCEV *BaseReg : F.BaseRegs)
+ RegUses.countRegister(BaseReg, LUIdx);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+ // Do not insert formula that we will not be able to expand.
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+ "Formula is illegal");
+
+ if (!LU.InsertFormula(F, *L))
+ return false;
+
+ CountRegisters(F, LUIdx);
+ return true;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+ SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+ SmallPtrSet<const SCEV *, 32> Visited;
+
+ while (!Worklist.empty()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ // Don't process the same SCEV twice
+ if (!Visited.insert(S).second)
+ continue;
+
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+ Worklist.append(N->op_begin(), N->op_end());
+ else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+ Worklist.push_back(C->getOperand());
+ else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ Worklist.push_back(D->getLHS());
+ Worklist.push_back(D->getRHS());
+ } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+ const Value *V = US->getValue();
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Look for instructions defined outside the loop.
+ if (L->contains(Inst)) continue;
+ } else if (isa<UndefValue>(V))
+ // Undef doesn't have a live range, so it doesn't matter.
+ continue;
+ for (const Use &U : V->uses()) {
+ const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ // Ignore non-instructions.
+ if (!UserInst)
+ continue;
+ // Ignore instructions in other functions (as can happen with
+ // Constants).
+ if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+ continue;
+ // Ignore instructions not dominated by the loop.
+ const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+ UserInst->getParent() :
+ cast<PHINode>(UserInst)->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ if (!DT.dominates(L->getHeader(), UseBB))
+ continue;
+ // Don't bother if the instruction is in a BB which ends in an EHPad.
+ if (UseBB->getTerminator()->isEHPad())
+ continue;
+ // Don't bother rewriting PHIs in catchswitch blocks.
+ if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
+ continue;
+ // Ignore uses which are part of other SCEV expressions, to avoid
+ // analyzing them multiple times.
+ if (SE.isSCEVable(UserInst->getType())) {
+ const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+ // If the user is a no-op, look through to its uses.
+ if (!isa<SCEVUnknown>(UserS))
+ continue;
+ if (UserS == US) {
+ Worklist.push_back(
+ SE.getUnknown(const_cast<Instruction *>(UserInst)));
+ continue;
+ }
+ }
+ // Ignore icmp instructions which are already being analyzed.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+ unsigned OtherIdx = !U.getOperandNo();
+ Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+ continue;
+ }
+
+ std::pair<size_t, int64_t> P = getUse(
+ S, LSRUse::Basic, MemAccessTy());
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+ InsertSupplementalFormula(US, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+ break;
+ }
+ }
+ }
+}
+
+/// Split S into subexpressions which can be pulled out into separate
+/// registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+ SmallVectorImpl<const SCEV *> &Ops,
+ const Loop *L,
+ ScalarEvolution &SE,
+ unsigned Depth = 0) {
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return S;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ // Break out add operands.
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ }
+ return nullptr;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // Split a non-zero base out of an addrec.
+ if (AR->getStart()->isZero() || !AR->isAffine())
+ return S;
+
+ const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+ C, Ops, L, SE, Depth+1);
+ // Split the non-zero AddRec unless it is part of a nested recurrence that
+ // does not pertain to this loop.
+ if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ Remainder = nullptr;
+ }
+ if (Remainder != AR->getStart()) {
+ if (!Remainder)
+ Remainder = SE.getConstant(AR->getType(), 0);
+ return SE.getAddRecExpr(Remainder,
+ AR->getStepRecurrence(SE),
+ AR->getLoop(),
+ //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ }
+ } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ // Break (C * (a + b + c)) into C*a + C*b + C*c.
+ if (Mul->getNumOperands() != 2)
+ return S;
+ if (const SCEVConstant *Op0 =
+ dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+ C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+ const SCEV *Remainder =
+ CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(SE.getMulExpr(C, Remainder));
+ return nullptr;
+ }
+ }
+ return S;
+}
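+
+// Illustrative split (names made up): for S = {(%a + (4 * %b)),+,8} with a
+// null multiplier, the call pushes %a and (4 * %b) onto Ops and returns the
+// stripped recurrence {0,+,8} as the remainder.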
+
+/// \brief Helper function for LSRInstance::GenerateReassociations.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ unsigned Depth, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ SmallVector<const SCEV *, 8> AddOps;
+ const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
+
+ if (AddOps.size() == 1)
+ return;
+
+ for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+ JE = AddOps.end();
+ J != JE; ++J) {
+
+ // Loop-variant "unknown" values are uninteresting; we won't be able to
+ // do anything meaningful with them.
+ if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+ continue;
+
+ // Don't pull a constant into a register if the constant could be folded
+ // into an immediate field.
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
+ continue;
+
+ // Collect all operands except *J.
+ SmallVector<const SCEV *, 8> InnerAddOps(
+ ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+ InnerAddOps.append(std::next(J),
+ ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+ // Don't leave just a constant behind in a register if the constant could
+ // be folded into an immediate field.
+ if (InnerAddOps.size() == 1 &&
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+ continue;
+
+ const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+ if (InnerSum->isZero())
+ continue;
+ Formula F = Base;
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ if (IsScaledReg)
+ F.ScaledReg = nullptr;
+ else
+ F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+ } else if (IsScaledReg)
+ F.ScaledReg = InnerSum;
+ else
+ F.BaseRegs[Idx] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+      // We may have changed the number of registers in base regs; adjust the
+ // formula accordingly.
+ F.canonicalize(*L);
+
+ if (InsertFormula(LU, LUIdx, F))
+ // If that formula hadn't been seen before, recurse to find more like
+ // it.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+ }
+}
+
+/// Split out subexpressions from adds and the bases of addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+ Formula Base, unsigned Depth) {
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
+
+ if (Base.Scale == 1)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+ /* Idx */ -1, /* IsScaledReg */ true);
+}
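+
+// Illustrative effect (names made up): a formula whose base register is the
+// sum (%a + (4 * %b)), with %a loop-invariant, can spawn a reassociated
+// formula in which %a and (4 * %b) each occupy their own base register,
+// subject to the legality checks above; the recursion then tries to split
+// those pieces further.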
+
+/// Generate a formula consisting of all of the loop-dominating registers added
+/// into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // This method is only interesting on a plurality of registers.
+ if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
+ return;
+
+ // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
+ // processing the formula.
+ Base.unscale();
+ Formula F = Base;
+ F.BaseRegs.clear();
+ SmallVector<const SCEV *, 4> Ops;
+ for (const SCEV *BaseReg : Base.BaseRegs) {
+ if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+ !SE.hasComputableLoopEvolution(BaseReg, L))
+ Ops.push_back(BaseReg);
+ else
+ F.BaseRegs.push_back(BaseReg);
+ }
+ if (Ops.size() > 1) {
+ const SCEV *Sum = SE.getAddExpr(Ops);
+ // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+ // opportunity to fold something. For now, just ignore such cases
+ // rather than proceed with zero in a register.
+ if (!Sum->isZero()) {
+ F.BaseRegs.push_back(Sum);
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
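+
+// For illustration (with hypothetical loop-invariant values A and B): a
+// formula such as reg(A) + reg(B) + reg({0,+,4}) may be collapsed into
+// reg(A + B) + reg({0,+,4}), so the loop-dominating values share a single
+// register while the addrec keeps its own.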
+
+/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ GlobalValue *GV = ExtractSymbol(G, SE);
+ if (G->isZero() || !GV)
+ return;
+ Formula F = Base;
+ F.BaseGV = GV;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // We can't add a symbolic offset if the address already contains one.
+ if (Base.BaseGV) return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
+ if (Base.Scale == 1)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
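+
+// For illustration (with a hypothetical global @gv): a base register holding
+// (@gv + {0,+,4}) may be rewritten so that @gv moves into the formula's
+// BaseGV field and only {0,+,4} remains in the register, provided isLegalUse
+// accepts a symbolic base for this use.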
+
+/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+void LSRInstance::GenerateConstantOffsetsImpl(
+ LSRUse &LU, unsigned LUIdx, const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ for (int64_t Offset : Worklist) {
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+ if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+ LU.AccessTy, F)) {
+ // Add the offset to the base register.
+ const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ // If it cancelled out, drop the base register, otherwise update it.
+ if (NewG->isZero()) {
+ if (IsScaledReg) {
+ F.Scale = 0;
+ F.ScaledReg = nullptr;
+ } else
+ F.deleteBaseReg(F.BaseRegs[Idx]);
+ F.canonicalize(*L);
+ } else if (IsScaledReg)
+ F.ScaledReg = NewG;
+ else
+ F.BaseRegs[Idx] = NewG;
+
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+
+ int64_t Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm == 0)
+ return;
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using constant offsets.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // TODO: For now, just add the min and max offset, because it usually isn't
+ // worthwhile looking at everything in between.
+ SmallVector<int64_t, 2> Worklist;
+ Worklist.push_back(LU.MinOffset);
+ if (LU.MaxOffset != LU.MinOffset)
+ Worklist.push_back(LU.MaxOffset);
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
+ if (Base.Scale == 1)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
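+
+// For illustration (hypothetical values): given a base register {16,+,4}
+// with base offset 0, the helper above can pull the constant out of the
+// register (register {0,+,4}, base offset 16), or fold a use offset the
+// other way (e.g. register {24,+,4}, base offset -8 for a use offset of 8).
+// The value computed is unchanged; only where the constant lives differs.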
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ if (LU.Kind != LSRUse::ICmpZero) return;
+
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+ if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+ // Don't do this if there is more than one offset.
+ if (LU.MinOffset != LU.MaxOffset) return;
+
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ // Check that the multiplication doesn't overflow.
+ if (Base.BaseOffset == INT64_MIN && Factor == -1)
+ continue;
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ if (NewBaseOffset / Factor != Base.BaseOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ continue;
+
+ // Check that multiplying with the use offset doesn't overflow.
+ int64_t Offset = LU.MinOffset;
+ if (Offset == INT64_MIN && Factor == -1)
+ continue;
+ Offset = (uint64_t)Offset * Factor;
+ if (Offset / Factor != LU.MinOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, Offset))
+ continue;
+
+ Formula F = Base;
+ F.BaseOffset = NewBaseOffset;
+
+ // Check that this scale is legal.
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+ continue;
+
+ // Compensate for the use having MinOffset built into it.
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+ // Check that multiplying with each base register doesn't overflow.
+ for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+ F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+ if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+ goto next;
+ }
+
+ // Check that multiplying with the scaled register doesn't overflow.
+ if (F.ScaledReg) {
+ F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+ if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+ continue;
+ }
+
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+ continue;
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ // If the offset will be truncated, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ continue;
+ }
+
+ // If we make it here and it's legal, add it.
+ (void)InsertFormula(LU, LUIdx, F);
+ next:;
+ }
+}
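+
+// For illustration (hypothetical values): comparing reg({0,+,1}) + -5
+// against zero can, with a factor of 4, be rewritten as comparing
+// reg({0,+,4}) + -20 against zero, which may allow reuse of an existing
+// stride-4 register.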
+
+/// Generate stride factor reuse formulae, for example by making use of
+/// scaled-offset address modes.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+
+ // If this Formula already has a scaled register, we can't add another one.
+ // Try to unscale the formula to generate a better scale.
+ if (Base.Scale != 0 && !Base.unscale())
+ return;
+
+ assert(Base.Scale == 0 && "unscale did not do its job!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ Base.Scale = Factor;
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
+ // Check whether this scale is going to be legal.
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
+ // As a special case, handle out-of-loop Basic users specially.
+ // TODO: Reconsider this special case.
+ if (LU.Kind == LSRUse::Basic &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
+ LU.AllFixupsOutsideLoop)
+ LU.Kind = LSRUse::Special;
+ else
+ continue;
+ }
+ // For an ICmpZero, negating a solitary base register won't lead to
+ // new solutions.
+ if (LU.Kind == LSRUse::ICmpZero &&
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ continue;
+ // For each addrec base reg, if its loop is the current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+ if (FactorS->isZero())
+ continue;
+ // Divide out the factor, ignoring high bits, since we'll be
+ // scaling the value back up in the end.
+ if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+ // TODO: This could be optimized to avoid all the copying.
+ Formula F = Base;
+ F.ScaledReg = Quotient;
+ F.deleteBaseReg(F.BaseRegs[i]);
+ // The canonical representation of 1*reg is reg, which is already in
+ // Base. In that case, do not try to insert the formula; it will be
+ // rejected anyway.
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
+ continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+ // a non-canonical Formula whose ScaledReg's loop is not L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ }
+ }
+}
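+
+// For illustration (hypothetical values): with a factor of 4, a formula
+// using base register {0,+,4} may be rewritten to use ScaledReg {0,+,1} with
+// Scale 4, exposing a scaled addressing mode (base + 4*index) where the
+// legality check above accepts it.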
+
+/// Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Don't bother truncating symbolic values.
+ if (Base.BaseGV) return;
+
+ // Determine the integer type for the base formula.
+ Type *DstTy = Base.getType();
+ if (!DstTy) return;
+ DstTy = SE.getEffectiveSCEVType(DstTy);
+
+ for (Type *SrcTy : Types) {
+ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+ Formula F = Base;
+
+ if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+ for (const SCEV *&BaseReg : F.BaseRegs)
+ BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+
+ // TODO: This assumes we've done basic processing on all uses and
+ // have an idea what the register usage is.
+ if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+ continue;
+
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
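+
+// For illustration (hypothetical types): if truncating i64 to i32 is free on
+// the target, a use with an i32 formula may gain a variant whose registers
+// are the corresponding i64 expressions, so wider registers already needed
+// by other uses can be shared and the result truncated at this use.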
+
+namespace {
+
+/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
+/// modifications so that the search phase doesn't have to worry about the data
+/// structures moving underneath it.
+struct WorkItem {
+ size_t LUIdx;
+ int64_t Imm;
+ const SCEV *OrigReg;
+
+ WorkItem(size_t LI, int64_t I, const SCEV *R)
+ : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+void WorkItem::print(raw_ostream &OS) const {
+ OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+ << " , add offset " << Imm;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+ // Group the registers by their value without any added constant offset.
+ typedef std::map<int64_t, const SCEV *> ImmMapTy;
+ DenseMap<const SCEV *, ImmMapTy> Map;
+ DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+ SmallVector<const SCEV *, 8> Sequence;
+ for (const SCEV *Use : RegUses) {
+ const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+ int64_t Imm = ExtractImmediate(Reg, SE);
+ auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+ if (Pair.second)
+ Sequence.push_back(Reg);
+ Pair.first->second.insert(std::make_pair(Imm, Use));
+ UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+ }
+
+ // Now examine each set of registers with the same base value. Build up
+ // a list of work to do and do the work in a separate step so that we're
+ // not adding formulae and register counts while we're searching.
+ SmallVector<WorkItem, 32> WorkItems;
+ SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+ for (const SCEV *Reg : Sequence) {
+ const ImmMapTy &Imms = Map.find(Reg)->second;
+
+ // It's not worthwhile looking for reuse if there's only one offset.
+ if (Imms.size() == 1)
+ continue;
+
+ DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry : Imms)
+ dbgs() << ' ' << Entry.first;
+ dbgs() << '\n');
+
+ // Examine each offset.
+ for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+ J != JE; ++J) {
+ const SCEV *OrigReg = J->second;
+
+ int64_t JImm = J->first;
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+ if (!isa<SCEVConstant>(OrigReg) &&
+ UsedByIndicesMap[Reg].count() == 1) {
+ DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+ continue;
+ }
+
+ // Conservatively examine offsets between this orig reg and a few selected
+ // other orig regs.
+ ImmMapTy::const_iterator OtherImms[] = {
+ Imms.begin(), std::prev(Imms.end()),
+ Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
+ 2)
+ };
+ for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+ ImmMapTy::const_iterator M = OtherImms[i];
+ if (M == J || M == JE) continue;
+
+ // Compute the difference between the two.
+ int64_t Imm = (uint64_t)JImm - M->first;
+ for (unsigned LUIdx : UsedByIndices.set_bits())
+ // Make a memo of this use, offset, and register tuple.
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+ WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+ }
+ }
+ }
+
+ Map.clear();
+ Sequence.clear();
+ UsedByIndicesMap.clear();
+ UniqueItems.clear();
+
+ // Now iterate through the worklist and add new formulae.
+ for (const WorkItem &WI : WorkItems) {
+ size_t LUIdx = WI.LUIdx;
+ LSRUse &LU = Uses[LUIdx];
+ int64_t Imm = WI.Imm;
+ const SCEV *OrigReg = WI.OrigReg;
+
+ Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+ const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+ unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+ // TODO: Use a more targeted data structure.
+ for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+ Formula F = LU.Formulae[L];
+ // FIXME: The code for the scaled and unscaled registers looks
+ // very similar but slightly different. Investigate if they
+ // could be merged. That way, we would not have to unscale the
+ // Formula.
+ F.unscale();
+ // Use the immediate in the scaled register.
+ if (F.ScaledReg == OrigReg) {
+ int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+ // Don't create 50 + reg(-50).
+ if (F.referencesReg(SE.getSCEV(
+ ConstantInt::get(IntTy, -(uint64_t)Offset))))
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = Offset;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ NewF))
+ continue;
+ NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+ // If the new scale is a constant in a register, and adding the constant
+ // value to the immediate would produce a value closer to zero than the
+ // immediate itself, then the formula isn't worthwhile.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+ if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+ (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+ .ule(std::abs(NewF.BaseOffset)))
+ continue;
+
+ // OK, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ } else {
+ // Use the immediate in a base register.
+ for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+ const SCEV *BaseReg = F.BaseRegs[N];
+ if (BaseReg != OrigReg)
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+ LU.Kind, LU.AccessTy, NewF)) {
+ if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ continue;
+ NewF = F;
+ NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ }
+ NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+ // If the new formula has a constant in a register, and adding the
+ // constant value to the immediate would produce a value closer to
+ // zero than the immediate itself, then the formula isn't worthwhile.
+ for (const SCEV *NewReg : NewF.BaseRegs)
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+ if ((C->getAPInt() + NewF.BaseOffset)
+ .abs()
+ .slt(std::abs(NewF.BaseOffset)) &&
+ (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+ countTrailingZeros<uint64_t>(NewF.BaseOffset))
+ goto skip_formula;
+
+ // Ok, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ break;
+ skip_formula:;
+ }
+ }
+ }
+ }
+}
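+
+// For illustration (with a hypothetical base A): if one use references
+// reg(A) and another references reg(A + 16), both registers share the
+// stripped base A, so a WorkItem with Imm 16 is queued and the second use
+// gains formulae that reuse reg(A) with the 16 folded into the base offset
+// (or the unfolded offset) instead of keeping a separate register.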
+
+/// Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+ // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+ // queries are more precise.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateScales(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+ }
+
+ GenerateCrossUseConstantOffsets();
+
+ DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+ SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+
+ // Collect the best formula for each unique set of shared registers. This
+ // is reset for each use.
+ typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
+ BestFormulaeTy;
+ BestFormulaeTy BestFormulae;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+ FIdx != NumForms; ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+
+ // Some formulas are instant losers. For example, they may depend on
+ // nonexistent AddRecs from other loops. These need to be filtered
+ // immediately, otherwise heuristics could choose them over others leading
+ // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+ // avoids the need to recompute this information across formulae using the
+ // same bad AddRec. Passing LoserRegs is also essential unless we remove
+ // the corresponding bad register from the Regs set.
+ Cost CostF;
+ Regs.clear();
+ CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
+ if (CostF.isLoser()) {
+ // During initial formula generation, undesirable formulae are generated
+ // by uses within other loops that have some non-trivial address mode or
+ // use the postinc form of the IV. LSR needs to provide these formulae
+ // as the basis of rediscovering the desired formula that uses an AddRec
+ // corresponding to the existing phi. Once all formulae have been
+ // generated, these initial losers may be pruned.
+ DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
+ }
+ else {
+ SmallVector<const SCEV *, 4> Key;
+ for (const SCEV *Reg : F.BaseRegs) {
+ if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+ Key.push_back(Reg);
+ }
+ if (F.ScaledReg &&
+ RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+ Key.push_back(F.ScaledReg);
+ // Unstable sort by host order is OK, because this is only used for
+ // uniquifying.
+ std::sort(Key.begin(), Key.end());
+
+ std::pair<BestFormulaeTy::const_iterator, bool> P =
+ BestFormulae.insert(std::make_pair(Key, FIdx));
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+
+ Cost CostBest;
+ Regs.clear();
+ CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
+ if (CostF.isLess(CostBest, TTI))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula "; Best.print(dbgs());
+ dbgs() << '\n');
+ }
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
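+
+// For illustration: two formulae of the same use that agree on every
+// register shared with other uses receive the same key above, so only the
+// one RateFormula considers cheaper survives; registers private to this use
+// do not participate in the key.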
+
+// This is a rough guess that seems to work fairly well.
+static const size_t ComplexityLimit = UINT16_MAX;
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider. It almost never considers this many solutions because it prunes
+/// the search space, but the pruning isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+ size_t Power = 1;
+ for (const LSRUse &LU : Uses) {
+ size_t FSize = LU.Formulae.size();
+ if (FSize >= ComplexityLimit) {
+ Power = ComplexityLimit;
+ break;
+ }
+ Power *= FSize;
+ if (Power >= ComplexityLimit)
+ break;
+ }
+ return Power;
+}
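+
+// For illustration (hypothetical counts): three uses with 8, 12 and 20
+// formulae give an estimate of 8 * 12 * 20 = 1920 candidate solutions, well
+// under ComplexityLimit (UINT16_MAX = 65535); the loop stops multiplying
+// once the running product reaches the limit.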
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ // Look for a formula with a constant or GV in a register. If the use
+ // also has a formula with that same value in an immediate field,
+ // delete the one that uses a register.
+ for (SmallVectorImpl<const SCEV *>::const_iterator
+ I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+ Formula NewF = F;
+ NewF.BaseOffset += C->getValue()->getSExtValue();
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+ if (!F.BaseGV) {
+ Formula NewF = F;
+ NewF.BaseGV = GV;
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
+
+ // This is especially useful for unrolled loops.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (const Formula &F : LU.Formulae) {
+ if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+ continue;
+
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
+
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Transfer the fixups of LU to LUThatHas.
+ for (LSRFixup &Fixup : LU.Fixups) {
+ Fixup.Offset += F.BaseOffset;
+ LUThatHas->pushFixup(Fixup);
+ DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ }
+
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i];
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx;
+ --NumUses;
+ break;
+ }
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
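+
+// For illustration (with a hypothetical base A): after unrolling, separate
+// uses of A, A + 4 and A + 8 each carry their own formulae; when a use's
+// formula is a pure constant offset from another use and the offset can be
+// reconciled, its fixups are transferred to that use (with offsets adjusted)
+// and the redundant use is deleted.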
+
+/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
+
+ FilterOutUndesirableDedicatedRegisters();
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
+
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic is to keep as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that a formulae set with more Scale and ScaledReg
+/// variations is more likely to yield a good solution than one where they
+/// are all the same. The picking-winner-reg heuristic will often keep the
+/// formulae with the same Scale and ScaledReg and filter out the others,
+/// and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by the current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA, CostFB;
+ Regs.clear();
+ CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ Regs.clear();
+ CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+ return CostFA.isLess(CostFB, TTI);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+/// This function deletes formulas with a high expected register count.
+/// Assuming we don't know the value of each formula (all clearly
+/// inefficient ones have already been deleted), compute the probability of
+/// not selecting each register.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the expected number of registers for each formula:
+/// Note that for each use we exclude the probability of not selecting for
+/// that use. For example, for Use1 the probability for reg(a) would be just
+/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will be 100% used in the final solution.
+ // Used in each formula of a solution (in the example above this is reg(c)).
+ // We can skip them in calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to the probability of not selecting it.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas whose expected register count is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If there is nothing to delete, continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
+
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+ // With all other options exhausted, loop until the system is simple
+ // enough to handle.
+ SmallPtrSet<const SCEV *, 4> Taken;
+ while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Pick the register which is used by the most LSRUses, which is likely
+ // to be a good reuse register candidate.
+ const SCEV *Best = nullptr;
+ unsigned BestNum = 0;
+ for (const SCEV *Reg : RegUses) {
+ if (Taken.count(Reg))
+ continue;
+ if (!Best) {
+ Best = Reg;
+ BestNum = RegUses.getUsedByIndices(Reg).count();
+ } else {
+ unsigned Count = RegUses.getUsedByIndices(Reg).count();
+ if (Count > BestNum) {
+ Best = Reg;
+ BestNum = Count;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
+ Taken.insert(Best);
+
+ // In any use with formulae which reference this register, delete formulae
+ // which don't reference it.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ if (!LU.Regs.count(Best)) continue;
+
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ if (!F.referencesReg(Best)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --e;
+ --i;
+ Any = true;
+ assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+ continue;
+ }
+ }
+
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+ }
+}
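+
+// For illustration: if reg({0,+,4}) is referenced by more uses than any
+// other register, it is picked as the winner; every use that references it
+// then keeps only formulae containing it, shrinking the product of formula
+// counts that EstimateSearchSpaceComplexity measures.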
+
+/// If there are an extraordinary number of formulae to choose from, use some
+/// rough heuristics to prune down the number of formulae. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+ NarrowSearchSpaceByDetectingSupersets();
+ NarrowSearchSpaceByCollapsingUnrolledCode();
+ NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const {
+ // Some ideas:
+ // - prune more:
+ // - use more aggressive filtering
+ // - sort the formula so that the most profitable solutions are found first
+ // - sort the uses too
+ // - search faster:
+ // - don't compute a cost, and then compare. compare while computing a cost
+ // and bail early.
+ // - track register sets with SmallBitVector
+
+ const LSRUse &LU = Uses[Workspace.size()];
+
+ // If this use references any register that's already a part of the
+ // in-progress solution, consider it a requirement that a formula must
+ // reference that register in order to be considered. This prunes out
+ // unprofitable searching.
+ SmallSetVector<const SCEV *, 4> ReqRegs;
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
+
+ SmallPtrSet<const SCEV *, 16> NewRegs;
+ Cost NewCost;
+ for (const Formula &F : LU.Formulae) {
+ // Ignore formulae which may not be ideal in terms of register reuse of
+ // ReqRegs. The formula should use all required registers before
+ // introducing new ones.
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
+ }
+
+ // Evaluate the cost of the current formula. If it's already worse than
+ // the current best, prune the search at that point.
+ NewCost = CurCost;
+ NewRegs = CurRegs;
+ NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
+ if (NewCost.isLess(SolutionCost, TTI)) {
+ Workspace.push_back(&F);
+ if (Workspace.size() != Uses.size()) {
+ SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+ NewRegs, VisitedRegs);
+ if (F.getNumRegs() == 1 && Workspace.size() == 1)
+ VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+ } else {
+ DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:";
+ for (const SCEV *S : NewRegs)
+ dbgs() << ' ' << *S;
+ dbgs() << '\n');
+
+ SolutionCost = NewCost;
+ Solution = Workspace;
+ }
+ Workspace.pop_back();
+ }
+ }
+}
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+ SmallVector<const Formula *, 8> Workspace;
+ Cost SolutionCost;
+ SolutionCost.Lose();
+ Cost CurCost;
+ SmallPtrSet<const SCEV *, 16> CurRegs;
+ DenseSet<const SCEV *> VisitedRegs;
+ Workspace.reserve(Uses.size());
+
+ // SolveRecurse does all the work.
+ SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+ CurRegs, VisitedRegs);
+ if (Solution.empty()) {
+ DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ return;
+ }
+
+ // Ok, we've now made all our decisions.
+ DEBUG(dbgs() << "\n"
+ "The chosen solution requires "; SolutionCost.print(dbgs());
+ dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
+
+ assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
+/// as we can go while still being dominated by the input positions. This helps
+/// canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs)
+ const {
+ Instruction *Tentative = &*IP;
+ while (true) {
+ bool AllDominate = true;
+ Instruction *BetterPos = nullptr;
+ // Don't bother attempting to insert before a catchswitch; its basic block
+ // cannot have other non-PHI instructions.
+ if (isa<CatchSwitchInst>(Tentative))
+ return IP;
+
+ for (Instruction *Inst : Inputs) {
+ if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+ AllDominate = false;
+ break;
+ }
+ // Attempt to find an insert position in the middle of the block,
+ // instead of at the end, so that it can be used for other expansions.
+ if (Tentative->getParent() == Inst->getParent() &&
+ (!BetterPos || !DT.dominates(Inst, BetterPos)))
+ BetterPos = &*std::next(BasicBlock::iterator(Inst));
+ }
+ if (!AllDominate)
+ break;
+ if (BetterPos)
+ IP = BetterPos->getIterator();
+ else
+ IP = Tentative->getIterator();
+
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP;
+ Rung = Rung->getIDom();
+ if (!Rung) return IP;
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break;
+ }
+
+ Tentative = IDom->getTerminator();
+ }
+
+ return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const {
+ // Collect some instructions which must be dominated by the
+ // expanding replacement. These must be dominated by any operands that
+ // will be required in the expansion.
+ SmallVector<Instruction *, 4> Inputs;
+ if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+ Inputs.push_back(I);
+ if (LU.Kind == LSRUse::ICmpZero)
+ if (Instruction *I =
+ dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+ Inputs.push_back(I);
+ if (LF.PostIncLoops.count(L)) {
+ if (LF.isUseFullyOutsideLoop(L))
+ Inputs.push_back(L->getLoopLatch()->getTerminator());
+ else
+ Inputs.push_back(IVIncInsertPos);
+ }
+ // The expansion must also be dominated by the increment positions of any
+ // loops for which it is using post-inc mode.
+ for (const Loop *PIL : LF.PostIncLoops) {
+ if (PIL == L) continue;
+
+ // Be dominated by the loop exit.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ PIL->getExitingBlocks(ExitingBlocks);
+ if (!ExitingBlocks.empty()) {
+ BasicBlock *BB = ExitingBlocks[0];
+ for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+ BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+ Inputs.push_back(BB->getTerminator());
+ }
+ }
+
+ assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+ && !isa<DbgInfoIntrinsic>(LowestIP) &&
+ "Insertion point must be a normal instruction");
+
+ // Then, climb up the immediate dominator tree as far as we can go while
+ // still being dominated by the input positions.
+ BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+ // Don't insert instructions before PHI nodes.
+ while (isa<PHINode>(IP)) ++IP;
+
+ // Ignore landingpad instructions.
+ while (IP->isEHPad()) ++IP;
+
+ // Ignore debug intrinsics.
+ while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+ // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+ // IP consistent across expansions and allows the previously inserted
+ // instructions to be reused by subsequent expansion.
+ while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+ ++IP;
+
+ return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding").
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace;
+
+ // Determine an input position which will be dominated by the operands and
+ // which will dominate the result.
+ IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+ Rewriter.setInsertPoint(&*IP);
+
+ // Inform the Rewriter if we have a post-increment use, so that it can
+ // perform an advantageous expansion.
+ Rewriter.setPostInc(LF.PostIncLoops);
+
+ // This is the type that the user actually needs.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ // This will be the type that we'll initially expand to.
+ Type *Ty = F.getType();
+ if (!Ty)
+ // No type known; just expand directly to the ultimate type.
+ Ty = OpTy;
+ else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+ // Expand directly to the ultimate type if it's the right size.
+ Ty = OpTy;
+ // This is the type to do integer arithmetic in.
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+ // Build up a list of operands to add together to form the full base.
+ SmallVector<const SCEV *, 8> Ops;
+
+ // Expand the BaseRegs portion.
+ for (const SCEV *Reg : F.BaseRegs) {
+ assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
+ Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
+ }
+
+ // Expand the ScaledReg portion.
+ Value *ICmpScaledV = nullptr;
+ if (F.Scale != 0) {
+ const SCEV *ScaledS = F.ScaledReg;
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
+
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // Expand ScaledReg as if it were part of the base regs.
+ if (F.Scale == 1)
+ Ops.push_back(
+ SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
+ else {
+ // An interesting way of "folding" with an icmp is to use a negated
+ // scale, which we'll implement by inserting it into the other operand
+ // of the icmp.
+ assert(F.Scale == -1 &&
+ "The only scale supported by ICmpZero uses is -1!");
+ ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
+ }
+ } else {
+ // Otherwise just expand the scaled register and an explicit scale,
+ // which is expected to be matched as part of the address.
+
+ // Flush the operand list to suppress SCEVExpander hoisting address modes,
+ // unless the addressing mode will not be folded.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+ isAMCompletelyFolded(TTI, LU, F)) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
+ if (F.Scale != 1)
+ ScaledS =
+ SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+ Ops.push_back(ScaledS);
+ }
+ }
+
+ // Expand the GV portion.
+ if (F.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ Ops.push_back(SE.getUnknown(F.BaseGV));
+ }
+
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+
+ // Expand the immediate portion.
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+ if (Offset != 0) {
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // The other interesting way of "folding" with an ICmpZero is to use a
+ // negated immediate.
+ if (!ICmpScaledV)
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ else {
+ Ops.push_back(SE.getUnknown(ICmpScaledV));
+ ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ }
+ } else {
+ // Just add the immediate values. These again are expected to be matched
+ // as part of the address.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ }
+ }
+
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
+ // Emit instructions summing all the operands.
+ const SCEV *FullS = Ops.empty() ?
+ SE.getConstant(IntTy, 0) :
+ SE.getAddExpr(Ops);
+ Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
+
+ // We're done expanding now, so reset the rewriter.
+ Rewriter.clearPostInc();
+
+ // An ICmpZero Formula represents an ICmp which we're handling as a
+ // comparison against zero. Now that we've expanded an expression for that
+ // form, update the ICmp's other operand.
+ if (LU.Kind == LSRUse::ICmpZero) {
+ ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+ DeadInsts.emplace_back(CI->getOperand(1));
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ if (F.Scale == -1) {
+ if (ICmpScaledV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+ OpTy, false),
+ ICmpScaledV, OpTy, "tmp", CI);
+ ICmpScaledV = Cast;
+ }
+ CI->setOperand(1, ICmpScaledV);
+ } else {
+ // A scale of 1 means that the scale has been expanded as part of the
+ // base regs.
+ assert((F.Scale == 0 || F.Scale == 1) &&
+ "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+ -(uint64_t)Offset);
+ if (C->getType() != OpTy)
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ OpTy, false),
+ C, OpTy);
+
+ CI->setOperand(1, C);
+ }
+ }
+
+ return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ DenseMap<BasicBlock *, Value *> Inserted;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, which complicates post-inc
+ // users.
+ if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(BB->getTerminator()) &&
+ !isa<CatchSwitchInst>(BB->getTerminator())) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
+ // Split the critical edge.
+ BasicBlock *NewBB = nullptr;
+ if (!Parent->isLandingPad()) {
+ NewBB = SplitCriticalEdge(BB, Parent,
+ CriticalEdgeSplittingOptions(&DT, &LI)
+ .setMergeIdenticalEdges()
+ .setDontDeleteUselessPHIs());
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+ NewBB = NewBBs[0];
+ }
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+ }
+ }
+ }
+
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+ Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+ if (!Pair.second)
+ PN->setIncomingValue(i, Pair.first->second);
+ else {
+ Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
+ Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy)
+ FullV =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false,
+ OpTy, false),
+ FullV, LF.OperandValToReplace->getType(),
+ "tmp", BB->getTerminator());
+
+ PN->setIncomingValue(i, FullV);
+ Pair.first->second = FullV;
+ }
+ }
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"), and update the UserInst to reference the newly
+/// expanded value.
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ // First, find an insertion point that dominates UserInst. For PHI nodes,
+ // find the nearest block which dominates all the relevant uses.
+ if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+ RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
+ } else {
+ Value *FullV =
+ Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+ FullV, OpTy, "tmp", LF.UserInst);
+ FullV = Cast;
+ }
+
+ // Update the user. ICmpZero is handled specially here (for now) because
+ // Expand may have updated one of the operands of the icmp already, and
+ // its new value may happen to be equal to LF.OperandValToReplace, in
+ // which case doing replaceUsesOfWith leads to replacing both operands
+ // with the same value. TODO: Reorganize this.
+ if (LU.Kind == LSRUse::ICmpZero)
+ LF.UserInst->setOperand(0, FullV);
+ else
+ LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+ }
+
+ DeadInsts.emplace_back(LF.OperandValToReplace);
+}
+
+/// Rewrite all the fixup locations with new values, following the chosen
+/// solution.
+void LSRInstance::ImplementSolution(
+ const SmallVectorImpl<const Formula *> &Solution) {
+ // Keep track of instructions we may have made dead, so that
+ // we can remove them after we are done working.
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+
+ SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
+ "lsr");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
+ Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+ // Mark phi nodes that terminate chains so the expander tries to reuse them.
+ for (const IVChain &Chain : IVChainVec) {
+ if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
+ Rewriter.setChainedPhi(PN);
+ }
+
+ // Expand the new value definitions and update the users.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Changed = true;
+ }
+
+ for (const IVChain &Chain : IVChainVec) {
+ GenerateIVChain(Chain, Rewriter, DeadInsts);
+ Changed = true;
+ }
+ // Clean up after ourselves. This must be done before deleting any
+ // instructions.
+ Rewriter.clear();
+
+ Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI)
+ : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
+ IVIncInsertPos(nullptr) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return;
+
+ // If there's no interesting work to be done, bail early.
+ if (IU.empty()) return;
+
+ // If there's too much analysis to be done, bail early. We won't be able to
+ // model the problem anyway.
+ unsigned NumUsers = 0;
+ for (const IVStrideUse &U : IU) {
+ if (++NumUsers > MaxIVUsers) {
+ (void)U;
+ DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ return;
+ }
+ // Bail out if we have a PHI on an EHPad that gets a value from a
+ // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
+ // no good place to stick any instructions.
+ if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
+ auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
+ if (isa<FuncletPadInst>(FirstNonPHI) ||
+ isa<CatchSwitchInst>(FirstNonPHI))
+ for (BasicBlock *PredBB : PN->blocks())
+ if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
+ return;
+ }
+ }
+
+#ifndef NDEBUG
+ // All dominating loops must have preheaders, or SCEVExpander may not be able
+ // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+ //
+ // IVUsers analysis should only create users that are dominated by simple loop
+ // headers. Since this loop should dominate all of its users, its user list
+ // should be empty if this loop itself is not within a simple loop nest.
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+ Rung; Rung = Rung->getIDom()) {
+ BasicBlock *BB = Rung->getBlock();
+ const Loop *DomLoop = LI.getLoopFor(BB);
+ if (DomLoop && DomLoop->getHeader() == BB) {
+ assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+ }
+ }
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
+
+ // First, perform some low-level loop optimizations.
+ OptimizeShadowIV();
+ OptimizeLoopTermCond();
+
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
+ if (!L->empty()) {
+ DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
+ // Start collecting data and preparing for the solver.
+ CollectChains();
+ CollectInterestingTypesAndFactors();
+ CollectFixupsAndInitialFormulae();
+ CollectLoopInvariantFixupsAndFormulae();
+
+ assert(!Uses.empty() && "IVUsers reported at least one use");
+ DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
+
+ // Now use the reuse data to generate a bunch of interesting ways
+ // to formulate the values needed for the uses.
+ GenerateAllReuseFormulae();
+
+ FilterOutUndesirableDedicatedRegisters();
+ NarrowSearchSpaceUsingHeuristics();
+
+ SmallVector<const Formula *, 8> Solution;
+ Solve(Solution);
+
+ // Release memory that is no longer needed.
+ Factors.clear();
+ Types.clear();
+ RegUses.clear();
+
+ if (Solution.empty())
+ return;
+
+#ifndef NDEBUG
+ // Formulae should be legal.
+ for (const LSRUse &LU : Uses) {
+ for (const Formula &F : LU.Formulae)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F) && "Illegal formula generated!");
+  }
+#endif
+
+ // Now that we've decided what we want, make it so.
+ ImplementSolution(Solution);
+}
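+
+// As a hypothetical illustration of the kind of rewrite the code above drives
+// (variable names and types are made up for the example), a loop that
+// addresses an array through its index:
+//
+//   for (int i = 0; i != n; ++i)   // address recomputed as a + i*4 each time
+//     sum += a[i];
+//
+// can be strength-reduced so the address itself becomes the induction
+// variable:
+//
+//   for (int *p = a; p != a + n; ++p)
+//     sum += *p;                   // the multiply folds into a pointer step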
+
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+ if (Factors.empty() && Types.empty()) return;
+
+ OS << "LSR has identified the following interesting factors and types: ";
+ bool First = true;
+
+ for (int64_t Factor : Factors) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '*' << Factor;
+ }
+
+ for (Type *Ty : Types) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '(' << *Ty << ')';
+ }
+ OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+ OS << "LSR is examining the following fixup sites:\n";
+ for (const LSRUse &LU : Uses)
+ for (const LSRFixup &LF : LU.Fixups) {
+ dbgs() << " ";
+ LF.print(OS);
+ OS << '\n';
+ }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+ OS << "LSR is examining the following uses:\n";
+ for (const LSRUse &LU : Uses) {
+ dbgs() << " ";
+ LU.print(OS);
+ OS << '\n';
+ for (const Formula &F : LU.Formulae) {
+ OS << " ";
+ F.print(OS);
+ OS << '\n';
+ }
+ }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+ print_factors_and_types(OS);
+ print_fixups(OS);
+ print_uses(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopStrengthReduce();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end anonymous namespace
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ // Requiring LoopSimplify a second time here prevents IVUsers from running
+ // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<IVUsersWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI) {
+ bool Changed = false;
+
+ // Run the main LSR transformation.
+ Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+
+ // Remove any extra phis created by processing inner loops.
+ Changed |= DeleteDeadPHIs(L->getHeader());
+ if (EnablePhiElim && L->isLoopSimplifyForm()) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Rewriter(SE, DL, "lsr");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
+ if (numFolded) {
+ Changed = true;
+ DeleteTriviallyDeadInstructions(DeadInsts);
+ DeleteDeadPHIs(L->getHeader());
+ }
+ }
+ return Changed;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+ if (skipLoop(L))
+ return false;
+
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+}
+
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+ AR.DT, AR.LI, AR.TTI))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+char LoopStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
new file mode 100644
index 000000000000..530a68424d5c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -0,0 +1,1225 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+static cl::opt<unsigned>
+ UnrollThreshold("unroll-threshold", cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
+
+static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
+ "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
+ cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
+ "to the threshold when aggressively unrolling a loop due to the "
+ "dynamic cost savings. If completely unrolling a loop will reduce "
+ "the total runtime from X to Y, we boost the loop unroll "
+ "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
+ "X/Y). This limit avoids excessive code bloat."));
+
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
+ cl::desc("Don't allow loop unrolling to simulate more than this number of"
+ "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollCount(
+ "unroll-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollMaxCount(
+ "unroll-max-count", cl::Hidden,
+ cl::desc("Set the max unroll count for partial and runtime unrolling, for"
+ "testing purposes"));
+
+static cl::opt<unsigned> UnrollFullMaxCount(
+ "unroll-full-max-count", cl::Hidden,
+ cl::desc(
+ "Set the max unroll count for full unrolling, for testing purposes"));
+
+static cl::opt<bool>
+ UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool> UnrollAllowRemainder(
+ "unroll-allow-remainder", cl::Hidden,
+ cl::desc("Allow generation of a loop remainder (extra iterations) "
+ "when unrolling a loop."));
+
+static cl::opt<bool>
+ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned> UnrollMaxUpperBound(
+ "unroll-max-upperbound", cl::init(8), cl::Hidden,
+ cl::desc(
+ "The max of trip count upper bound that is considered in unrolling"));
+
+static cl::opt<unsigned> PragmaUnrollThreshold(
+ "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
+ "unroll_count pragma."));
+
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+ "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+ cl::desc("If the runtime tripcount for the loop is lower than the "
+ "threshold, the loop is considered as flat and will be less "
+ "aggressively unrolled."));
+
+static cl::opt<bool>
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
+ cl::desc("Allows loops to be peeled when the dynamic "
+ "trip count is known to be low."));
+
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = UINT_MAX;
+
+/// Gather the various unrolling parameters based on the defaults, compiler
+/// flags, TTI overrides and user specified parameters.
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound) {
+ TargetTransformInfo::UnrollingPreferences UP;
+
+ // Set up the defaults
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
+ UP.MaxPercentThresholdBoost = 400;
+ UP.OptSizeThreshold = 0;
+ UP.PartialThreshold = 150;
+ UP.PartialOptSizeThreshold = 0;
+ UP.Count = 0;
+ UP.PeelCount = 0;
+ UP.DefaultUnrollRuntimeCount = 8;
+ UP.MaxCount = UINT_MAX;
+ UP.FullUnrollMaxCount = UINT_MAX;
+ UP.BEInsns = 2;
+ UP.Partial = false;
+ UP.Runtime = false;
+ UP.AllowRemainder = true;
+ UP.AllowExpensiveTripCount = false;
+ UP.Force = false;
+ UP.UpperBound = false;
+ UP.AllowPeeling = true;
+
+ // Override with any target specific settings
+ TTI.getUnrollingPreferences(L, SE, UP);
+
+ // Apply size attributes
+ if (L->getHeader()->getParent()->optForSize()) {
+ UP.Threshold = UP.OptSizeThreshold;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold;
+ }
+
+ // Apply any user values specified by cl::opt
+ if (UnrollThreshold.getNumOccurrences() > 0)
+ UP.Threshold = UnrollThreshold;
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
+ if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
+ UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
+ if (UnrollMaxCount.getNumOccurrences() > 0)
+ UP.MaxCount = UnrollMaxCount;
+ if (UnrollFullMaxCount.getNumOccurrences() > 0)
+ UP.FullUnrollMaxCount = UnrollFullMaxCount;
+ if (UnrollAllowPartial.getNumOccurrences() > 0)
+ UP.Partial = UnrollAllowPartial;
+ if (UnrollAllowRemainder.getNumOccurrences() > 0)
+ UP.AllowRemainder = UnrollAllowRemainder;
+ if (UnrollRuntime.getNumOccurrences() > 0)
+ UP.Runtime = UnrollRuntime;
+ if (UnrollMaxUpperBound == 0)
+ UP.UpperBound = false;
+ if (UnrollAllowPeeling.getNumOccurrences() > 0)
+ UP.AllowPeeling = UnrollAllowPeeling;
+
+ // Apply user values provided by argument
+ if (UserThreshold.hasValue()) {
+ UP.Threshold = *UserThreshold;
+ UP.PartialThreshold = *UserThreshold;
+ }
+ if (UserCount.hasValue())
+ UP.Count = *UserCount;
+ if (UserAllowPartial.hasValue())
+ UP.Partial = *UserAllowPartial;
+ if (UserRuntime.hasValue())
+ UP.Runtime = *UserRuntime;
+ if (UserUpperBound.hasValue())
+ UP.UpperBound = *UserUpperBound;
+
+ return UP;
+}
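+
+// As an illustrative example of the precedence implemented above, a run such
+// as (hypothetical input file name):
+//
+//   opt -loop-unroll -unroll-threshold=300 -unroll-allow-partial input.ll -S
+//
+// starts from the defaults, applies any TTI and opt-for-size adjustments, and
+// then the two cl::opt flags override UP.Threshold and UP.Partial before any
+// caller-provided values are applied last.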
+
+namespace {
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
+ Instruction *I;
+ int Iteration : 30;
+ unsigned IsFree : 1;
+ unsigned IsCounted : 1;
+};
+
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+ typedef DenseMapInfo<Instruction *> PtrInfo;
+ typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo;
+ static inline UnrolledInstState getEmptyKey() {
+ return {PtrInfo::getEmptyKey(), 0, 0, 0};
+ }
+ static inline UnrolledInstState getTombstoneKey() {
+ return {PtrInfo::getTombstoneKey(), 0, 0, 0};
+ }
+ static inline unsigned getHashValue(const UnrolledInstState &S) {
+ return PairInfo::getHashValue({S.I, S.Iteration});
+ }
+ static inline bool isEqual(const UnrolledInstState &LHS,
+ const UnrolledInstState &RHS) {
+ return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
+ }
+};
+}
+
+namespace {
+struct EstimatedUnrollCost {
+ /// \brief The estimated cost after unrolling.
+ unsigned UnrolledCost;
+
+ /// \brief The estimated dynamic cost of executing the instructions in the
+ /// rolled form.
+ unsigned RolledDynamicCost;
+};
+}
+
+/// \brief Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It computes the cost of the unrolled loop
+/// (UnrolledCost) and the dynamic cost of the original loop
+/// (RolledDynamicCost). By dynamic cost we mean that we won't count the costs
+/// of blocks that are known not
+/// to be executed (i.e. if we have a branch in the loop and we know that at the
+/// given iteration its condition would be resolved to true, we won't add up the
+/// cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None.
+static Optional<EstimatedUnrollCost>
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ unsigned MaxUnrolledLoopSize) {
+ // We want to be able to scale offsets by the trip count and add more offsets
+ // to them without checking for overflows, and we already don't want to
+ // analyze *massive* trip counts, so we force the max to be reasonably small.
+ assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
+ "The unroll iterations max is too large!");
+
+ // Only analyze inner loops. We can't properly estimate cost of nested loops
+ // and we won't visit inner loops again anyway.
+ if (!L->empty())
+ return None;
+
+ // Don't simulate loops with a big or unknown tripcount
+ if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+ TripCount > UnrollMaxIterationsCountToAnalyze)
+ return None;
+
+ SmallSetVector<BasicBlock *, 16> BBWorklist;
+ SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
+
+ // The estimated cost of the unrolled form of the loop. We try to estimate
+ // this by simplifying as much as we can while computing the estimate.
+ unsigned UnrolledCost = 0;
+
+ // We also track the estimated dynamic (that is, actually executed) cost in
+ // the rolled form. This helps identify cases when the savings from unrolling
+ // aren't just exposing dead control flows, but actual reduced dynamic
+ // instructions due to the simplifications which we expect to occur after
+ // unrolling.
+ unsigned RolledDynamicCost = 0;
+
+ // We track the simplification of each instruction in each iteration. We use
+ // this to recursively merge costs into the unrolled cost on-demand so that
+ // we don't count the cost of any dead code. This is essentially a map from
+ // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
+ DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+ // A small worklist used to accumulate cost of instructions from each
+ // observable and reached root in the loop.
+ SmallVector<Instruction *, 16> CostWorklist;
+
+ // PHI-used worklist used between iterations while accumulating cost.
+ SmallVector<Instruction *, 4> PHIUsedList;
+
+ // Helper function to accumulate cost for instructions in the loop.
+ auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+ assert(Iteration >= 0 && "Cannot have a negative iteration!");
+ assert(CostWorklist.empty() && "Must start with an empty cost list");
+ assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+ CostWorklist.push_back(&RootI);
+ for (;; --Iteration) {
+ do {
+ Instruction *I = CostWorklist.pop_back_val();
+
+ // InstCostMap only uses I and Iteration as a key, the other two values
+ // don't matter here.
+ auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+ if (CostIter == InstCostMap.end())
+ // If an input to a PHI node comes from a dead path through the loop
+ // we may have no cost data for it here. What that actually means is
+ // that it is free.
+ continue;
+ auto &Cost = *CostIter;
+ if (Cost.IsCounted)
+ // Already counted this instruction.
+ continue;
+
+ // Mark that we are counting the cost of this instruction now.
+ Cost.IsCounted = true;
+
+ // If this is a PHI node in the loop header, just add it to the PHI set.
+ if (auto *PhiI = dyn_cast<PHINode>(I))
+ if (PhiI->getParent() == L->getHeader()) {
+ assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+ "inherently simplify during unrolling.");
+ if (Iteration == 0)
+ continue;
+
+ // Push the incoming value from the backedge into the PHI used list
+ // if it is an in-loop instruction. We'll use this to populate the
+ // cost worklist for the next iteration (as we count backwards).
+ if (auto *OpI = dyn_cast<Instruction>(
+ PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+ if (L->contains(OpI))
+ PHIUsedList.push_back(OpI);
+ continue;
+ }
+
+ // First accumulate the cost of this instruction.
+ if (!Cost.IsFree) {
+ UnrolledCost += TTI.getUserCost(I);
+ DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
+ << "): ");
+ DEBUG(I->dump());
+ }
+
+ // We must count the cost of every operand which is not free,
+ // recursively. If we reach a loop PHI node, simply add it to the set
+ // to be considered on the next iteration (backwards!).
+ for (Value *Op : I->operands()) {
+ // Check whether this operand is free due to being a constant or
+ // outside the loop.
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || !L->contains(OpI))
+ continue;
+
+ // Otherwise accumulate its cost.
+ CostWorklist.push_back(OpI);
+ }
+ } while (!CostWorklist.empty());
+
+ if (PHIUsedList.empty())
+ // We've exhausted the search.
+ break;
+
+ assert(Iteration > 0 &&
+ "Cannot track PHI-used values past the first iteration!");
+ CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+ PHIUsedList.clear();
+ }
+ };
+
+ // Ensure that we don't violate the loop structure invariants relied on by
+ // this analysis.
+ assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+ assert(L->isLCSSAForm(DT) &&
+ "Must have loops in LCSSA form to track live-out values.");
+
+ DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
+ // Simulate execution of each iteration of the loop counting instructions,
+ // which would be simplified.
+ // Since the same load will take different values on different iterations,
+  // we literally have to go through all of the loop's iterations.
+ for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+ DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+ // Prepare for the iteration by collecting any simplified entry or backedge
+ // inputs.
+ for (Instruction &I : *L->getHeader()) {
+ auto *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+
+      // The loop header PHI nodes must have exactly two inputs: one from the
+ // loop preheader and one from the loop latch.
+ assert(
+ PHI->getNumIncomingValues() == 2 &&
+ "Must have an incoming value only for the preheader and the latch.");
+
+ Value *V = PHI->getIncomingValueForBlock(
+ Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+ Constant *C = dyn_cast<Constant>(V);
+ if (Iteration != 0 && !C)
+ C = SimplifiedValues.lookup(V);
+ if (C)
+ SimplifiedInputValues.push_back({PHI, C});
+ }
+
+ // Now clear and re-populate the map for the next iteration.
+ SimplifiedValues.clear();
+ while (!SimplifiedInputValues.empty())
+ SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
+
+ BBWorklist.clear();
+ BBWorklist.insert(L->getHeader());
+ // Note that we *must not* cache the size, this loop grows the worklist.
+ for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+ BasicBlock *BB = BBWorklist[Idx];
+
+ // Visit all instructions in the given basic block and try to simplify
+ // it. We don't change the actual IR, just count optimization
+ // opportunities.
+ for (Instruction &I : *BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Track this instruction's expected baseline cost when executing the
+ // rolled loop form.
+ RolledDynamicCost += TTI.getUserCost(&I);
+
+ // Visit the instruction to analyze its loop cost after unrolling,
+ // and if the visitor returns true, mark the instruction as free after
+ // unrolling and continue.
+ bool IsFree = Analyzer.visit(I);
+ bool Inserted = InstCostMap.insert({&I, (int)Iteration,
+ (unsigned)IsFree,
+ /*IsCounted*/ false}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+ if (IsFree)
+ continue;
+
+ // Can't properly model a cost of a call.
+ // FIXME: With a proper cost model we should be able to do it.
+ if(isa<CallInst>(&I))
+ return None;
+
+ // If the instruction might have a side-effect recursively account for
+ // the cost of it and all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
+ // If unrolled body turns out to be too big, bail out.
+ if (UnrolledCost > MaxUnrolledLoopSize) {
+ DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
+ return None;
+ }
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+
+      // Add in the live successors by first checking whether we have a terminator
+ // that may be simplified based on the values simplified by this call.
+ BasicBlock *KnownSucc = nullptr;
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional()) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(BI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(SI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = SI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
+ }
+ }
+ if (KnownSucc) {
+ if (L->contains(KnownSucc))
+ BBWorklist.insert(KnownSucc);
+ else
+ ExitWorklist.insert({BB, KnownSucc});
+ continue;
+ }
+
+ // Add BB's successors to the worklist.
+ for (BasicBlock *Succ : successors(BB))
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ else
+ ExitWorklist.insert({BB, Succ});
+ AddCostRecursively(*TI, Iteration);
+ }
+
+ // If we found no optimization opportunities on the first iteration, we
+    // won't find them on later ones either.
+ if (UnrolledCost == RolledDynamicCost) {
+ DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
+ return None;
+ }
+ }
+
+ while (!ExitWorklist.empty()) {
+ BasicBlock *ExitingBB, *ExitBB;
+ std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+ for (Instruction &I : *ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (L->contains(OpI))
+ AddCostRecursively(*OpI, TripCount - 1);
+ }
+ }
+
+ DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ return {{UnrolledCost, RolledDynamicCost}};
+}
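+
+// A hypothetical example of what the analysis above rewards: in source such as
+//
+//   static const int Table[4] = {2, 3, 5, 7};
+//   int Sum = 0;
+//   for (int I = 0; I < 4; ++I)
+//     Sum += Table[I];
+//
+// every load in the fully unrolled body reads a known element of a constant
+// array, so those loads (and the arithmetic fed by them) simplify away and
+// UnrolledCost comes out well below RolledDynamicCost.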
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+ bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ AssumptionCache *AC, unsigned BEInsns) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ NumCalls = Metrics.NumInlineCandidates;
+ NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
+
+ unsigned LoopSize = Metrics.NumInsts;
+
+  // Don't allow an estimate of size zero. This would allow unrolling of loops
+ // with huge iteration counts, which is a compile time problem even if it's
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, BEInsns + 1);
+
+ return LoopSize;
+}
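+
+// For instance (illustrative numbers only), a body that CodeMetrics counts as
+// two instructions with BEInsns == 2 is still reported as
+// std::max(2u, 2u + 1) == 3 instructions.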
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has an unroll(full) pragma.
+static bool HasUnrollFullPragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+}
+
+// Returns true if the loop has an unroll(enable) pragma. This metadata is used
+// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
+static bool HasUnrollEnablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+}
+
+// Returns true if the loop has an unroll(disable) pragma.
+static bool HasUnrollDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
+}
+
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
+// If loop has an unroll_count pragma return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned UnrollCountPragmaValue(const Loop *L) {
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
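+
+// For illustration, "#pragma clang loop unroll_count(4)" ends up attached to
+// the loop roughly as (metadata numbering is arbitrary):
+//
+//   br i1 %cond, label %loop, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.count", i32 4}
+//
+// and the code above extracts the i32 operand (4) from that hint node.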
+
+// Remove existing unroll metadata and add unroll disable metadata to
+// indicate the loop has already been unrolled. This prevents a loop
+// from being unrolled more than is directed by a pragma if the loop
+// unrolling pass is run more than once (which it generally is).
+static void SetLoopAlreadyUnrolled(Loop *L) {
+ MDNode *LoopID = L->getLoopID();
+ // First remove any existing loop unrolling metadata.
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ bool IsUnrollMetadata = false;
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+ }
+ if (!IsUnrollMetadata)
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ // Add unroll(disable) metadata to disable future unrolling.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+}
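+
+// After this runs, the new loop ID looks roughly like
+//
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.disable"}
+//
+// so a later invocation of the unroller sees HasUnrollDisablePragma() succeed
+// and leaves the loop alone.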
+
+// Computes the boosting factor for complete unrolling.
+// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
+// be beneficial to fully unroll the loop even if UnrolledCost is large. We
+// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
+// the unroll threshold.
+static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
+ unsigned MaxPercentThresholdBoost) {
+ if (Cost.RolledDynamicCost >= UINT_MAX / 100)
+ return 100;
+ else if (Cost.UnrolledCost != 0)
+ // The boosting factor is RolledDynamicCost / UnrolledCost
+ return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
+ MaxPercentThresholdBoost);
+ else
+ return MaxPercentThresholdBoost;
+}
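+
+// For example (made-up costs): with RolledDynamicCost == 1000, UnrolledCost ==
+// 200 and MaxPercentThresholdBoost == 400, the boost is
+// std::min(100 * 1000 / 200, 400u) == 400, i.e. the threshold may be inflated
+// by at most 4x for this loop.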
+
+// Returns loop size estimation for unrolled loop.
+static uint64_t getUnrolledLoopSize(
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
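+
+// For example (made-up sizes): LoopSize == 10, UP.BEInsns == 2 and UP.Count ==
+// 4 give (10 - 2) * 4 + 2 == 34, i.e. only the non-backedge instructions are
+// treated as being replicated once per unrolled iteration.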
+
+// Returns true if unroll count was set explicitly.
+// Calculates unroll count and writes it to UP.Count.
+static bool computeUnrollCount(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
+ // Check for explicit Count.
+ // 1st priority is unroll count set by "unroll-count" option.
+ bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollCount;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+ return true;
+ }
+
+ // 2nd priority is unroll count set by pragma.
+ unsigned PragmaCount = UnrollCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return true;
+ }
+ bool PragmaFullUnroll = HasUnrollFullPragma(L);
+ if (PragmaFullUnroll && TripCount != 0) {
+ UP.Count = TripCount;
+ if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return false;
+ }
+
+ bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
+
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaThreshold value
+ // which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ }
+
+ // 3rd priority is full unroll count.
+ // Full unroll makes sense only when TripCount or its upper bound could be
+ // statically calculated.
+ // Also we need to check if we exceed FullUnrollMaxCount.
+ // If using the upper bound to unroll, TripMultiple should be set to 1 because
+ // we do not know when loop may exit.
+ // MaxTripCount and ExactTripCount cannot both be non zero since we only
+ // compute the former when the latter is zero.
+ unsigned ExactTripCount = TripCount;
+  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
+         "ExactTripCount and MaxTripCount cannot both be non zero.");
+ unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+ UP.Count = FullUnrollTripCount;
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ } else {
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ }
+ }
+ }
+ }
+
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
+  // Try partial unroll only when TripCount could be statically calculated.
+ if (TripCount) {
+ UP.Partial |= ExplicitUnroll;
+ if (!UP.Partial) {
+ DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+      // Reduce the unroll count so that it evenly divides TripCount for
+      // partial unrolling.
+ if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count =
+ (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ while (UP.Count != 0 && TripCount % UP.Count != 0)
+ UP.Count--;
+ if (UP.AllowRemainder && UP.Count <= 1) {
+        // If there is no Count that evenly divides TripCount, set Count to the
+        // largest power-of-two factor that satisfies the threshold limit.
+        // Since this creates a fixup (remainder) loop, only do this kind of
+        // unrolling if a remainder loop is allowed.
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+ }
+ if (UP.Count < 2) {
+ if (PragmaEnableUnroll)
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) pragma "
+ "because unrolled size is too large.");
+ UP.Count = 0;
+ }
+ } else {
+ UP.Count = TripCount;
+ }
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma because "
+ "unrolled size is too large.");
+ return ExplicitUnroll;
+ }
+ assert(TripCount == 0 &&
+ "All cases when TripCount is constant should be covered here.");
+ if (PragmaFullUnroll)
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE,
+ "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) pragma "
+ "because loop has a runtime trip count.");
+
+ // 6th priority is runtime unrolling.
+ // Don't unroll a runtime trip count loop when it is disabled.
+ if (HasRuntimeUnrollDisablePragma(L)) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check if the runtime trip count is too small when profile is available.
+ if (L->getHeader()->getParent()->getEntryCount()) {
+ if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
+ if (*ProfileTripCount < FlatLoopTripCountThreshold)
+ return false;
+ else
+ UP.AllowExpensiveTripCount = true;
+ }
+ }
+
+ // Reduce count based on the type of unrolling and the threshold values.
+ UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
+ if (!UP.Runtime) {
+ DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+
+ // Reduce unroll count to be the largest power-of-two factor of
+ // the original count which satisfies the threshold limit.
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+
+#ifndef NDEBUG
+ unsigned OrigCount = UP.Count;
+#endif
+
+ if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
+ while (UP.Count != 0 && TripMultiple % UP.Count != 0)
+ UP.Count >>= 1;
+ DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from "
+ << OrigCount << " to " << UP.Count << ".\n");
+ using namespace ore;
+ if (PragmaCount > 0 && !UP.AllowRemainder)
+ ORE->emit(
+ OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+ "(that could architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).");
+ }
+
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n");
+ if (UP.Count < 2)
+ UP.Count = 0;
+ return ExplicitUnroll;
+}
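+
+// As a worked example of the partial-unroll clamping above (made-up numbers):
+// with TripCount == 10 and a size-limited starting count of 4, the
+// "TripCount % UP.Count != 0" loop lowers the count 4 -> 3 -> 2, so the loop is
+// partially unrolled by a factor of 2 and no remainder iterations are needed.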
+
+static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
+ bool PreserveLCSSA, int OptLevel,
+ Optional<unsigned> ProvidedCount,
+ Optional<unsigned> ProvidedThreshold,
+ Optional<bool> ProvidedAllowPartial,
+ Optional<bool> ProvidedRuntime,
+ Optional<bool> ProvidedUpperBound) {
+ DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
+ << "] Loop %" << L->getHeader()->getName() << "\n");
+ if (HasUnrollDisablePragma(L))
+ return false;
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
+ return false;
+ }
+
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound);
+ // Exit early if unrolling is disabled.
+ if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
+ return false;
+ unsigned LoopSize = ApproximateLoopSize(
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
+ DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (NotDuplicatable) {
+ DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return false;
+ }
+ if (NumInlineCandidates != 0) {
+ DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return false;
+ }
+
+ // Find trip count and trip multiple if count is not available
+ unsigned TripCount = 0;
+ unsigned MaxTripCount = 0;
+ unsigned TripMultiple = 1;
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
+  // operation. Therefore restrict the remainder loop (try unrolling without
+  // one).
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ if (Convergent)
+ UP.AllowRemainder = false;
+
+ // Try to find the trip count upper bound if we cannot find the exact trip
+ // count.
+ bool MaxOrZero = false;
+ if (!TripCount) {
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+ // the generic upper bound unrolling keeps all but the last loop test so the
+    // number of loop tests goes up, which may end up being worse on targets
+    // with constrained branch predictor resources, so it is controlled by an
+    // option.)
+ // In addition we only unroll small upper bounds.
+ if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
+ MaxTripCount = 0;
+ }
+ }
+
+ // computeUnrollCount() decides whether it is beneficial to use upper bound to
+ // fully unroll the loop.
+ bool UseUpperBound = false;
+ bool IsCountSetExplicitly =
+ computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
+ TripMultiple, LoopSize, UP, UseUpperBound);
+ if (!UP.Count)
+ return false;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (TripCount && UP.Count > TripCount)
+ UP.Count = TripCount;
+
+ // Unroll the loop.
+ if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
+ UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
+ TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
+ PreserveLCSSA))
+ return false;
+
+  // If the loop has an unroll count pragma or was unrolled by an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
+ // If the loop was peeled, we already "used up" the profile information
+ // we had, so we don't want to unroll or peel again.
+ if (IsCountSetExplicitly || UP.PeelCount)
+ SetLoopAlreadyUnrolled(L);
+
+ return true;
+}
+
+namespace {
+class LoopUnroll : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None)
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ int OptLevel;
+ Optional<unsigned> ProvidedCount;
+ Optional<unsigned> ProvidedThreshold;
+ Optional<bool> ProvidedAllowPartial;
+ Optional<bool> ProvidedRuntime;
+ Optional<bool> ProvidedUpperBound;
+
+ bool runOnLoop(Loop *L, LPPassManager &) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
+ ProvidedCount, ProvidedThreshold,
+ ProvidedAllowPartial, ProvidedRuntime,
+ ProvidedUpperBound);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // FIXME: Loop passes are required to preserve domtree, and for now we just
+ // recreate dom info if anything gets unrolled.
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopUnroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime,
+ int UpperBound) {
+ // TODO: It would make more sense for this function to take the optionals
+ // directly, but that's dangerous since it would silently break out of tree
+ // callers.
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
+}
+
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
+}
+
+PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+ const auto &FAM =
+ AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+ Function *F = L.getHeader()->getParent();
+
+ auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+ // FIXME: This should probably be optional rather than required.
+ if (!ORE)
+ report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
+ "cached at a higher level");
+
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these things we actually pass "None" in for the optional to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
+ if (!AllowPartialUnrolling)
+ AllowPartialParam = RuntimeParam = UpperBoundParam = false;
+ bool Changed = tryToUnrollLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Partial unrolling clones child loops within the current loop. If it
+ // uses a remainder, then it can also create any number of sibling loops.
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ // - Loop peeling can directly introduce new sibling loops by peeling one
+ // iteration.
+ //
+ // When a new loop appears as a sibling loop, either from peeling an
+ // iteration or fully unrolling, its nesting structure has fundamentally
+ // changed and we want to revisit it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops. This is a highly speculative mode
+ // anyways so look for any simplifications that arose from partial
+ // unrolling or peeling off of iterations.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
+ return getLoopPassPreservedAnalyses();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 000000000000..d0c96fa627a4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1527 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to multiple loops. For example, it turns the left into the right code:
+//
+// for (...) if (lic)
+// A for (...)
+// if (lic) A; B; C
+// B else
+// C for (...)
+// A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts, "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(100), cl::Hidden);
+
+namespace {
+
+ class LUAnalysisCache {
+
+ typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> >
+ UnswitchedValsMap;
+
+ typedef UnswitchedValsMap::iterator UnswitchedValsIt;
+
+ struct LoopProperties {
+ unsigned CanBeUnswitchedCount;
+ unsigned WasUnswitchedCount;
+ unsigned SizeEstimation;
+ UnswitchedValsMap UnswitchedVals;
+ };
+
+ // Here we use std::map instead of DenseMap, since we need to keep valid
+ // LoopProperties pointer for current loop for better performance.
+ typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
+ typedef LoopPropsMap::iterator LoopPropsMapIt;
+
+ LoopPropsMap LoopsProperties;
+ UnswitchedValsMap *CurLoopInstructions;
+ LoopProperties *CurrentLoopProperties;
+
+ // A loop unswitching with an estimated cost above this threshold
+ // is not performed. MaxSize is turned into unswitching quota for
+ // the current loop, and reduced correspondingly, though note that
+ // the quota is returned by releaseMemory() when the loop has been
+ // processed, so that MaxSize will return to its previous
+ // value. So in most cases MaxSize will equal the Threshold flag
+ // when a new loop is processed. An exception to that is that
+ // MaxSize will have a smaller value while processing nested loops
+ // that were introduced due to loop unswitching of an outer loop.
+ //
+ // FIXME: The way that MaxSize works is subtle and depends on the
+ // pass manager processing loops and calling releaseMemory() in a
+ // specific order. It would be good to find a more straightforward
+ // way of doing what MaxSize does.
+ unsigned MaxSize;
+
+ public:
+ LUAnalysisCache()
+ : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
+ MaxSize(Threshold) {}
+
+    // Analyze the loop: check its size and determine whether it is possible
+    // to unswitch it. Returns true if we can unswitch this loop.
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
+
+    // Clean up all data related to the given loop.
+ void forgetLoop(const Loop *L);
+
+    // Mark the case value as unswitched.
+    // Since a switch instruction can be partly unswitched, keep track of all
+    // unswitched values to avoid extra unswitching in cloned loops.
+ void setUnswitched(const SwitchInst *SI, const Value *V);
+
+    // Check whether this case value has already been unswitched.
+ bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Returns true if another unswitching could be done within the cost
+ // threshold.
+ bool CostAllowsUnswitching();
+
+    // Clone all loop-unswitch related loop properties.
+    // Redistribute unswitching quotas.
+    // Note that the new loop's data is obtained through the VMap.
+ void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap);
+ };
+
+ class LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM;
+ AssumptionCache *AC;
+
+    // Used to check if the second loop needs processing after
+    // RewriteLoopBodyWithConditionConstant rewrites the first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+
+ LUAnalysisCache BranchesInfo;
+
+ bool OptimizeForSize;
+ bool redoLoop;
+
+ Loop *currentLoop;
+ DominatorTree *DT;
+ BasicBlock *loopHeader;
+ BasicBlock *loopPreheader;
+
+ bool SanitizeMemory;
+ LoopSafetyInfo SafetyInfo;
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contains the cloned copies of the basic blocks in LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ bool hasBranchDivergence;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
+ LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
+ currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
+ loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool processCurrentLoop();
+ bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ private:
+
+ void releaseMemory() override {
+ BranchesInfo.forgetLoop(currentLoop);
+ }
+
+ void initLoopData() {
+ loopHeader = currentLoop->getHeader();
+ loopPreheader = currentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void SplitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool TryTrivialLoopUnswitch(bool &Changed);
+
+ bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ TerminatorInst *TI = nullptr);
+ void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock, TerminatorInst *TI);
+ void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ TerminatorInst *TI);
+
+ void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool isEqual);
+
+ void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ Instruction *InsertPt,
+ TerminatorInst *TI);
+
+ void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+    /// Given that the invariant is not equal to Val, simplify instructions
+    /// in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
+ };
+}
+
+// Analyze the loop: check its size and determine whether it is possible to
+// unswitch it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
+
+ LoopPropsMapIt PropsIt;
+ bool Inserted;
+ std::tie(PropsIt, Inserted) =
+ LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+ LoopProperties &Props = PropsIt->second;
+
+ if (Inserted) {
+ // New loop.
+
+ // Limit the number of instructions to avoid causing significant code
+ // expansion, and the number of basic blocks, to avoid loops with
+ // large numbers of branches which cause loop unswitching to go crazy.
+ // This is a very ad-hoc heuristic.
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // FIXME: This is overly conservative because it does not take into
+ // consideration code simplification opportunities and code that can
+ // be shared by the resultant unswitched loops.
+ CodeMetrics Metrics;
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+ ++I)
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+
+ Props.SizeEstimation = Metrics.NumInsts;
+ Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+ Props.WasUnswitchedCount = 0;
+ MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << L->getHeader()->getName() << ", contents cannot be "
+ << "duplicated!\n");
+ return false;
+ }
+ }
+
+  // Be careful: these links are valid only until a new loop is added.
+ CurrentLoopProperties = &Props;
+ CurLoopInstructions = &Props.UnswitchedVals;
+
+ return true;
+}
+
+// Clean up all data related to the given loop.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+
+ LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+ if (LIt != LoopsProperties.end()) {
+ LoopProperties &Props = LIt->second;
+ MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+ Props.SizeEstimation;
+ LoopsProperties.erase(LIt);
+ }
+
+ CurrentLoopProperties = nullptr;
+ CurLoopInstructions = nullptr;
+}
+
+// Mark the case value as unswitched.
+// Since a switch instruction can be partly unswitched, keep track of all
+// unswitched values to avoid extra unswitching in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+ (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check whether this case value has already been unswitched.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+ return (*CurLoopInstructions)[SI].count(V);
+}
+
+bool LUAnalysisCache::CostAllowsUnswitching() {
+ return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note that the new loop's data is obtained through the VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap) {
+
+ LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+ LoopProperties &OldLoopProps = *CurrentLoopProperties;
+ UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
+
+ // Reallocate "can-be-unswitched quota"
+
+ --OldLoopProps.CanBeUnswitchedCount;
+ ++OldLoopProps.WasUnswitchedCount;
+ NewLoopProps.WasUnswitchedCount = 0;
+ unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+ NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+ OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+ NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+  // Clone the unswitched-values info:
+  // for the new loop's switches we clone the info about values that were
+  // already unswitched and have redundant successors.
+ for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+ const SwitchInst *OldInst = I->first;
+ Value *NewI = VMap.lookup(OldInst);
+ const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
+ assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+ NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+ }
+}
+
+char LoopUnswitch::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
+}
+
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
+///
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we cannot reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can
+/// use 0 or ~0 to simplify the chain.
+///
+/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be
+/// able to simplify the condition itself to a loop-variant condition, but at
+/// the cost of creating an entirely new loop.
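+///
+/// Illustrative sketch (hypothetical source, for exposition only): in
+/// "if (x < n && inv)" where only 'inv' is loop invariant, walking the AND
+/// chain returns 'inv' as a partial LIV. Unswitching on it lets one loop
+/// version assume 'inv' is true, reducing the condition to "x < n", while the
+/// other version assumes it is false, folding the whole condition to false.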
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
+ DenseMap<Value *, Value *> &Cache) {
+ auto CacheIt = Cache.find(Cond);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+  // We have started analyzing a new instruction; increment the counter of
+  // scanned instructions.
+ ++TotalInsts;
+
+ // We can never unswitch on vector conditions.
+ if (Cond->getType()->isVectorTy())
+ return nullptr;
+
+ // Constants should be folded, not unswitched on!
+ if (isa<Constant>(Cond)) return nullptr;
+
+ // TODO: Handle: br (VARIANT|INVARIANT).
+
+ // Hoist simple values out.
+ if (L->makeLoopInvariant(Cond, Changed)) {
+ Cache[Cond] = Cond;
+ return Cond;
+ }
+
+ // Walk up the operator chain to find partial invariant conditions.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+ if (BO->getOpcode() == Instruction::And ||
+ BO->getOpcode() == Instruction::Or) {
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
+ }
+
+      // If we reach a Mixed state, we do not want to keep walking up as we
+      // cannot reliably find a value that will simplify the chain. With this
+      // check, we will return null at the first sight of a mixed chain and the
+      // caller will either backtrack to find a partial LIV in the other
+      // operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
+ }
+ }
+
+ Cache[Cond] = nullptr;
+ return nullptr;
+}
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
+ DenseMap<Value *, Value *> Cache;
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+  // In case we do find a LIV, it cannot be obtained by walking up a mixed
+  // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+ if (skipLoop(L))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LPM = &LPM_Ref;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ currentLoop = L;
+ Function *F = currentLoop->getHeader()->getParent();
+
+ SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
+ if (SanitizeMemory)
+ computeLoopSafetyInfo(&SafetyInfo, L);
+
+ bool Changed = false;
+ do {
+ assert(currentLoop->isLCSSAForm(*DT));
+ redoLoop = false;
+ Changed |= processCurrentLoop();
+ } while(redoLoop);
+
+ // FIXME: Reconstruct dom info, because it is not preserved properly.
+ if (Changed)
+ DT->recalculate(*F);
+ return Changed;
+}
+
+// Return true if the BasicBlock BB is unreachable from the loop header.
+// Return false otherwise.
+bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
+ auto *Node = DT->getNode(BB)->getIDom();
+ BasicBlock *DomBB = Node->getBlock();
+ while (currentLoop->contains(DomBB)) {
+ BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
+
+ Node = DT->getNode(DomBB)->getIDom();
+ DomBB = Node->getBlock();
+
+ if (!BInst || !BInst->isConditional())
+ continue;
+
+ Value *Cond = BInst->getCondition();
+ if (!isa<ConstantInt>(Cond))
+ continue;
+
+ BasicBlock *UnreachableSucc =
+ Cond == ConstantInt::getTrue(Cond->getContext())
+ ? BInst->getSuccessor(1)
+ : BInst->getSuccessor(0);
+
+ if (DT->dominates(UnreachableSucc, BB))
+ return true;
+ }
+ return false;
+}
+
+/// Do actual work and unswitch loop if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+ bool Changed = false;
+
+ initLoopData();
+
+ // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+ if (!loopPreheader)
+ return false;
+
+ // Loops with indirectbr cannot be cloned.
+ if (!currentLoop->isSafeToClone())
+ return false;
+
+ // Without dedicated exits, splitting the exit edge may fail.
+ if (!currentLoop->hasDedicatedExits())
+ return false;
+
+ LLVMContext &Context = loopHeader->getContext();
+
+  // Analyze the loop cost, and stop unswitching if the loop contents cannot
+  // be duplicated.
+ if (!BranchesInfo.countLoop(
+ currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *currentLoop->getHeader()->getParent()),
+ AC))
+ return false;
+
+  // Try trivial unswitching first, before looping over the other basic blocks
+  // in the loop.
+ if (TryTrivialLoopUnswitch(Changed)) {
+ return true;
+ }
+
+ // Run through the instructions in the loop, keeping track of three things:
+ //
+ // - That we do not unswitch loops containing convergent operations, as we
+ // might be making them control dependent on the unswitch value when they
+ // were not before.
+ // FIXME: This could be refined to only bail if the convergent operation is
+ // not already control-dependent on the unswitch value.
+ //
+  //  - That we do not unswitch loops containing invokes whose unwind
+  //    destinations have predecessor edges we cannot split.
+  //
+  //  - The set of guard intrinsics encountered (these are non-terminator
+  //    instructions that are also profitable to unswitch).
+
+ SmallVector<IntrinsicInst *, 4> Guards;
+
+ for (const auto BB : currentLoop->blocks()) {
+ for (auto &I : *BB) {
+ auto CS = CallSite(&I);
+ if (!CS) continue;
+ if (CS.hasFnAttr(Attribute::Convergent))
+ return false;
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ if (!II->getUnwindDest()->canSplitPredecessors())
+ return false;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+ }
+ }
+
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::optForSize().
+ if (OptimizeForSize ||
+ loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+
+ for (IntrinsicInst *Guard : Guards) {
+ Value *LoopCond =
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
+ if (LoopCond &&
+ UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ // NB! Unswitching (if successful) could have erased some of the
+ // instructions in Guards leaving dangling pointers there. This is fine
+ // because we're returning now, and won't look at Guards again.
+ ++NumGuards;
+ return true;
+ }
+ }
+
+ // Loop over all of the basic blocks in the loop. If we find an interior
+ // block that is branching on a loop-invariant condition, we can unswitch this
+ // loop.
+ for (Loop::block_iterator I = currentLoop->block_begin(),
+ E = currentLoop->block_end(); I != E; ++I) {
+ TerminatorInst *TI = (*I)->getTerminator();
+
+ // Unswitching on a potentially uninitialized predicate is not
+ // MSan-friendly. Limit this to the cases when the original predicate is
+ // guaranteed to execute, to avoid creating a use-of-uninitialized-value
+ // in the code that did not have one.
+ // This is a workaround for the discrepancy between LLVM IR and MSan
+ // semantics. See PR28054 for more details.
+ if (SanitizeMemory &&
+ !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo))
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // Some branches may be rendered unreachable because of previous
+ // unswitching.
+ // Unswitch only those branches that are reachable.
+ if (isUnreachableDueToPreviousUnswitching(*I))
+ continue;
+
+ // If this isn't branching on an invariant condition, we can't unswitch
+ // it.
+ if (BI->isConditional()) {
+ // See if this, or some part of it, is loop invariant. If so, we can
+ // unswitch on it if we desire.
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed).first;
+ if (LoopCond &&
+ UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+ ++NumBranches;
+ return true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
+ unsigned NumCases = SI->getNumCases();
+ if (LoopCond && NumCases) {
+ // Find a value to unswitch on:
+          // FIXME: this should choose the most expensive case!
+ // FIXME: scan for a case with a non-critical edge?
+ Constant *UnswitchVal = nullptr;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+            // If the chain only has ANDs and the switch has a case value of 0,
+            // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+            // If the chain only has ORs and the switch has a case value of ~0,
+            // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+            // Do not process the same value again and again.
+            // At this point some cases are already unswitched and some are
+            // not; find the first case value that has not been unswitched yet.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
+ }
+ }
+
+ if (!UnswitchVal)
+ continue;
+
+ if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+ ++NumSwitches;
+            // In case of a full LIV, UnswitchVal is the value we unswitched
+            // out. In case of a partial LIV, we only unswitch when it is an
+            // AND-chain or OR-chain. In both cases the switch input value
+            // simplifies to UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
+ return true;
+ }
+ }
+ }
+
+ // Scan the instructions to check for unswitchable values.
+ for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+ BBI != E; ++BBI)
+ if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed).first;
+ if (LoopCond && UnswitchIfProfitable(LoopCond,
+ ConstantInt::getTrue(Context))) {
+ ++NumSelects;
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+/// Check to see if all paths from BB exit the loop with no side effects
+/// (including infinite loops).
+///
+/// If so, we return true and set ExitBB to the block the loop
+/// exits through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+ BasicBlock *&ExitBB,
+ std::set<BasicBlock*> &Visited) {
+ if (!Visited.insert(BB).second) {
+ // Already visited. Without more analysis, this could indicate an infinite
+ // loop.
+ return false;
+ }
+ if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit; that is fine as long as this is the
+    // first exit.
+ if (ExitBB) return false;
+ ExitBB = BB;
+ return true;
+ }
+
+ // Otherwise, this is an unvisited intra-loop node. Check all successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+ // Check to see if the successor is a trivial loop exit.
+ if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+ return false;
+ }
+
+  // Okay, everything after this looks good; check to make sure that this block
+  // doesn't include any side effects.
+ for (Instruction &I : *BB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ return true;
+}
+
+/// Return true if the specified block unconditionally leads to an exit from
+/// the specified loop, and has no side-effects in the process. If so, return
+/// the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+ std::set<BasicBlock*> Visited;
+ Visited.insert(L->getHeader()); // Branches to header make infinite loops.
+ BasicBlock *ExitBB = nullptr;
+ if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+ return ExitBB;
+ return nullptr;
+}
+
+/// We have found that we can unswitch currentLoop when LoopCond == Val to
+/// simplify the loop. If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ TerminatorInst *TI) {
+ // Check to see if it would be profitable to unswitch current loop.
+ if (!BranchesInfo.CostAllowsUnswitching()) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
+ return false;
+ }
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
+
+ UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
+ return true;
+}
+
+/// Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop &New = *new Loop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+ LPM->addLoop(New);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *I : *L)
+ CloneLoop(I, &New, VM, LI, LPM);
+
+ return &New;
+}
+
+/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
+/// otherwise branch to FalseDest. Insert the code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ Instruction *InsertPt,
+ TerminatorInst *TI) {
+ // Insert a conditional branch on LIC to the two preheaders. The original
+ // code is the true version and the new code is the false version.
+ Value *BranchVal = LIC;
+ bool Swapped = false;
+ if (!isa<ConstantInt>(Val) ||
+ Val->getType() != Type::getInt1Ty(LIC->getContext()))
+ BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
+ else if (Val != ConstantInt::getTrue(Val->getContext())) {
+ // We want to enter the new loop when the condition is true.
+ std::swap(TrueDest, FalseDest);
+ Swapped = true;
+ }
+
+ // Insert the new branch.
+ BranchInst *BI =
+ IRBuilder<>(InsertPt).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ if (Swapped)
+ BI->swapProfMetadata();
+
+ // If either edge is critical, split it. This helps preserve LoopSimplify
+ // form for enclosing loops.
+ auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
+}
+
+/// Given a loop that has a trivial unswitchable condition in it (a cond branch
+/// from its header block to its latch block, where the path through the loop
+/// that doesn't execute its body has no side-effects), unswitch it. This
+/// doesn't involve any code duplication, just moving the conditional branch
+/// outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock,
+ TerminatorInst *TI) {
+ DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName() << " on cond: " << *Val
+ << " == " << *Cond << "\n");
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change loopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI);
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch.
+ EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
+ loopPreheader->getTerminator(), TI);
+ LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+ loopPreheader->getTerminator()->eraseFromParent();
+
+ // We need to reprocess this loop, it could be unswitched again.
+ redoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+ ++NumTrivial;
+}
+
+/// Check if the first non-constant condition starting from the loop header is
+/// a trivial unswitch condition: that is, a condition that controls whether or
+/// not the loop does anything at all. If it is a trivial condition,
+/// unswitching produces no code duplication (equivalently, it produces a
+/// simpler loop and a new empty loop, which gets deleted). Therefore we always
+/// unswitch a trivial condition.
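+///
+/// Illustrative sketch (hypothetical source, for exposition only):
+///
+///   while (true) {
+///     if (early_exit)   // loop invariant; the exit path has no side effects
+///       break;
+///     if (!do_work())   // the loop's normal, variant exit
+///       break;
+///   }
+///
+/// Hoisting the test of 'early_exit' in front of the loop lets one version
+/// skip the loop entirely while the other folds away the dead header branch,
+/// and no loop body is duplicated.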
+bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = currentLoop->getHeader();
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+ LLVMContext &Context = CurrentBB->getContext();
+
+  // If the loop header has only one reachable successor (currently via an
+  // unconditional branch or a constant-foldable conditional branch; in the
+  // future we should also consider handling constant-foldable switch
+  // instructions), we should keep looking for trivial condition candidates in
+  // the successor as well. An alternative is to constant-fold conditions and
+  // merge successors into the loop header (then we would only need to check
+  // the header's terminator). The reason for not doing this in the
+  // LoopUnswitch pass is that it could potentially break the LoopPassManager's
+  // invariants. Folding dead branches could either eliminate the current loop
+  // or make other loops unreachable. LCSSA form might also not be preserved
+  // after deleting branches. The following code keeps traversing the loop
+  // header's successors until it finds a trivial condition candidate (a
+  // condition that is not a constant). Since unswitching generates branches
+  // with constant conditions, this scenario could be very common in practice.
+ SmallSet<BasicBlock*, 8> Visited;
+
+ while (true) {
+    // If we exit the loop or reach a previously visited block, then
+    // we cannot reach any trivial condition candidates (unfoldable
+    // branch instructions or switch instructions) and no unswitch
+    // can happen. Exit and return false.
+ if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ return false;
+
+ // Check if this loop will execute any side-effecting instructions (e.g.
+ // stores, calls, volatile loads) in the part of the loop that the code
+ // *would* execute. Check the header first.
+ for (Instruction &I : *CurrentBB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ if (BI->isUnconditional()) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+ CurrentBB = BI->getSuccessor(1);
+ } else {
+ // Found a trivial condition candidate: non-foldable conditional branch.
+ break;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should have probably
+ // been folded.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ } else {
+ // We do not understand these terminator instructions.
+ break;
+ }
+
+ CurrentTerm = CurrentBB->getTerminator();
+ }
+
+  // CondVal is the constant value of the trivial condition that causes the
+  // loop to exit. LoopExitBB is the BasicBlock the loop exits through when the
+  // trivial condition takes that value.
+ Constant *CondVal = nullptr;
+ BasicBlock *LoopExitBB = nullptr;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional())
+ return false;
+
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed).first;
+
+    // Unswitch only if the trivial condition itself is an LIV (not a
+    // partial LIV, which could occur in an and/or chain).
+ if (!LoopCond || LoopCond != BI->getCondition())
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(0)))) {
+ CondVal = ConstantInt::getTrue(Context);
+ } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(1)))) {
+ CondVal = ConstantInt::getFalse(Context);
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ CurrentTerm);
+ ++NumBranches;
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed).first;
+
+    // Unswitch only if the trivial condition itself is an LIV (not a
+    // partial LIV, which could occur in an and/or chain).
+ if (!LoopCond || LoopCond != SI->getCondition())
+ return false;
+
+ // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this.
+ // Note that we can't trivially unswitch on the default case or
+ // on already unswitched cases.
+ for (auto Case : SI->cases()) {
+ BasicBlock *LoopExitCandidate;
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ ConstantInt *CaseVal = Case.getCaseValue();
+
+        // Check that it was not unswitched before, since values that were
+        // already unswitched still look trivial.
+ if (BranchesInfo.isUnswitched(SI, CaseVal))
+ continue;
+ LoopExitBB = LoopExitCandidate;
+ CondVal = CaseVal;
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
+ ++NumSwitches;
+ return true;
+ }
+ return false;
+}
+
+/// Split all of the edges from inside the loop to their exit blocks.
+/// Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+ pred_end(ExitBlock));
+
+ // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+ // general, if we call it on all predecessors of all exits then it does.
+ SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI,
+ /*PreserveLCSSA*/ true);
+ }
+}
+
+/// We determined that the loop is profitable to unswitch when LIC equals Val.
+/// Split it into loop versions and test the condition outside of either loop.
+/// Return the loops created as Out1/Out2.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+ Loop *L, TerminatorInst *TI) {
+ Function *F = loopHeader->getParent();
+ DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName()
+ << " when '" << *Val << "' == " << *LIC << "\n");
+
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetLoop(L);
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
+ LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ SplitExitEdges(L, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting; recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
+ LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ ValueToValueMapTy VMap;
+ for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
+ NewBlocks.push_back(NewBB);
+ VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping.
+ LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(NewPreheader->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+  // Recalculate the unswitching quota and inherit the simplified-switch info
+  // for the new loop, cloning the other loop-unswitch related loop properties.
+ BranchesInfo.cloneData(NewLoop, L, VMap);
+
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader and exit blocks to the parent loop
+ // as well.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+ }
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit.
+ for (BasicBlock::iterator I = ExitSucc->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+ ValueToValueMapTy::iterator It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ PN->addIncoming(V, NewExit);
+ }
+
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+ PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+ &*ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst();
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
+ }
+
+ // Rewrite the code to refer to itself.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
+ for (Instruction &I : *NewBlocks[i]) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ // Emit the new branch that selects between the two versions of this loop.
+ EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+ TI);
+ LPM->deleteSimpleAnalysisValue(OldBR, L);
+ OldBR->eraseFromParent();
+
+ LoopProcessWorklist.push_back(NewLoop);
+ redoLoop = true;
+
+  // Keep a WeakTrackingVH holding onto LIC. If the first call to
+  // RewriteLoopBodyWithConditionConstant deletes the instruction (for example
+  // by simplifying a PHI that feeds into the condition that we're unswitching
+  // on), we don't rewrite the second iteration.
+ WeakTrackingVH LICHandle(LIC);
+
+ // Now we rewrite the original code to know that the condition is true and the
+ // new code to know that the condition is false.
+ RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+ // It's possible that simplifying one loop could cause the other to be
+  // changed to another value or a constant.  If it's a constant, don't simplify
+ // it.
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+ LICHandle && !isa<Constant>(LICHandle))
+ RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+}
+
+/// Remove all instances of I from the worklist vector specified.
+static void RemoveFromWorklist(Instruction *I,
+ std::vector<Instruction*> &Worklist) {
+
+ Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
+ Worklist.end());
+}
+
+/// When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+ std::vector<Instruction*> &Worklist,
+ Loop *L, LPPassManager *LPM) {
+ DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+
+ // Add users to the worklist which may be simplified now.
+ for (User *U : I->users())
+ Worklist.push_back(cast<Instruction>(U));
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->replaceAllUsesWith(V);
+ if (!I->mayHaveSideEffects())
+ I->eraseFromParent();
+ ++NumSimplify;
+}
+
+/// We know either that the value LIC has the value specified by Val in the
+/// specified loop, or we know it does NOT have that value.
+/// Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
+ std::vector<Instruction*> Worklist;
+ LLVMContext &Context = Val->getContext();
+
+ // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+ // in the loop with the appropriate one directly.
+ if (IsEqual || (isa<ConstantInt>(Val) &&
+ Val->getType()->isIntegerTy(1))) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+ Worklist.push_back(UI);
+ }
+
+ for (Instruction *UI : Worklist)
+ UI->replaceUsesOfWith(LIC, Replacement);
+
+ SimplifyCode(Worklist, L);
+ return;
+ }
+
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+    // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+        // This in-loop instruction has been simplified w.r.t. its context,
+        // i.e. LIC != Val; make sure we propagate its replacement value to
+        // all of its users.
+        //
+        // We cannot delete UI, the LIC user, yet, because that would
+        // invalidate the LIC->users() iterator. However, we can make this
+        // instruction dead by replacing all of its users and pushing it onto
+        // the worklist so that it can be properly deleted and its operands
+        // simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
+
+    // This is a LIC user; push it into the worklist so that SimplifyCode can
+    // attempt to simplify it.
+ Worklist.push_back(UI);
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+ if (!SI || !isa<ConstantInt>(Val)) continue;
+
+ // NOTE: if a case value for the switch is unswitched out, we record it
+    // after the unswitch finishes. We cannot record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
+ // Default case is live for multiple values.
+ if (DeadCase == *SI->case_default())
+ continue;
+
+    // Found a dead case value.  Don't remove PHI nodes in the
+    // successor if they become single-entry; those PHI nodes may
+    // be in the Users list.
+
+ BasicBlock *Switch = SI->getParent();
+ BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *Latch = L->getLoopLatch();
+
+ if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
+ // If the DeadCase successor dominates the loop latch, then the
+ // transformation isn't safe since it will delete the sole predecessor edge
+ // to the latch.
+ if (Latch && DT->dominates(SISucc, Latch))
+ continue;
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+ // loop structure and put the block on a dead code path.
+ SplitEdge(Switch, SISucc, DT, LI);
+ // Compute the successors instead of relying on the return value
+ // of SplitEdge, since it may have split the switch successor
+ // after PHI nodes.
+ BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+ // Create an "unreachable" destination.
+ BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+ Switch->getParent(),
+ OldSISucc);
+ new UnreachableInst(Context, Abort);
+ // Force the new case destination to branch to the "unreachable"
+ // block while maintaining a (dead) CFG edge to the old block.
+ NewSISucc->getTerminator()->eraseFromParent();
+ BranchInst::Create(Abort, OldSISucc,
+ ConstantInt::getTrue(Context), NewSISucc);
+ // Release the PHI operands for this edge.
+ for (BasicBlock::iterator II = NewSISucc->begin();
+ PHINode *PN = dyn_cast<PHINode>(II); ++II)
+ PN->setIncomingValue(PN->getBasicBlockIndex(Switch),
+ UndefValue::get(PN->getType()));
+ // Tell the domtree about the new block. We don't fully update the
+ // domtree here -- instead we force it to do a full recomputation
+ // after the pass is complete -- but we do need to inform it of
+ // new blocks.
+ DT->addNewBlock(Abort, NewSISucc);
+ }
+
+ SimplifyCode(Worklist, L);
+}
+
+/// Now that we have simplified some instructions in the loop, walk over it and
+/// constant prop, dce, and fold control flow where possible. Note that this is
+/// effectively a very simple loop-structure-aware optimizer. During processing
+/// of this loop, L could very well be deleted, so it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ // Simple DCE.
+ if (isInstructionTriviallyDead(I)) {
+ DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+ LPM->deleteSimpleAnalysisValue(I, L);
+ RemoveFromWorklist(I, Worklist);
+ I->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ // See if instruction simplification can hack this up. This is common for
+ // things like "select false, X, Y" after unswitching made the condition be
+ // 'false'. TODO: update the domtree properly so we can pass it here.
+ if (Value *V = SimplifyInstruction(I, DL))
+ if (LI->replacementPreservesLCSSAForm(I, V)) {
+ ReplaceUsesOfWith(I, V, Worklist, L, LPM);
+ continue;
+ }
+
+ // Special case hacks that appear commonly in unswitched code.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (BI->isUnconditional()) {
+ // If BI's parent is the only pred of the successor, fold the two blocks
+ // together.
+ BasicBlock *Pred = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *SinglePred = Succ->getSinglePredecessor();
+ if (!SinglePred) continue; // Nothing to do.
+ assert(SinglePred == Pred && "CFG broken");
+
+ DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+ << Succ->getName() << "\n");
+
+ // Resolve any single entry PHI nodes in Succ.
+ while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+ ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+ // If Succ has any successors with PHI nodes, update them to have
+ // entries coming from Pred instead of Succ.
+ Succ->replaceAllUsesWith(Pred);
+
+ // Move all of the successor contents from Succ to Pred.
+ Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
+ Succ->begin(), Succ->end());
+ LPM->deleteSimpleAnalysisValue(BI, L);
+ RemoveFromWorklist(BI, Worklist);
+ BI->eraseFromParent();
+
+ // Remove Succ from the loop tree.
+ LI->removeBlock(Succ);
+ LPM->deleteSimpleAnalysisValue(Succ, L);
+ Succ->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ continue;
+ }
+ }
+}
+
+/// Simple simplifications we can do given the information that Cond is
+/// definitely not equal to Val.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
new file mode 100644
index 000000000000..c23d891b6504
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -0,0 +1,578 @@
+//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When alias analysis is uncertain about the aliasing between any two accesses,
+// it will return MayAlias. This uncertainty from alias analysis restricts LICM
+// from proceeding further. In cases where alias analysis is uncertain we might
+// use loop versioning as an alternative.
+//
+// Loop Versioning will create a version of the loop with aggressive aliasing
+// assumptions in addition to the original with conservative (default) aliasing
+// assumptions. The version of the loop making aggressive aliasing assumptions
+// will have all the memory accesses marked as no-alias. These two versions of
+// the loop will be preceded by a memory runtime check consisting of bounds
+// checks for all of the unique memory accesses in the loop, which ensures that
+// no memory aliasing occurs. The result of the runtime check determines which of
+// the loop versions is executed: If the runtime check detects any memory
+// aliasing, then the original loop is executed. Otherwise, the version with
+// aggressive aliasing assumptions is used.
+//
+// Following are the top level steps:
+//
+// a) Perform LoopVersioningLICM's feasibility check.
+// b) If loop is a candidate for versioning then create a memory bound check,
+// by considering all the memory accesses in loop body.
+// c) Clone original loop and set all memory accesses as no-alias in new loop.
+// d) Set original loop & versioned loop as a branch target of the runtime check
+// result.
+//
+// It transforms loop as shown below:
+//
+// +----------------+
+// |Runtime Memcheck|
+// +----------------+
+// |
+// +----------+----------------+----------+
+// | |
+// +---------+----------+ +-----------+----------+
+// |Orig Loop Preheader | |Cloned Loop Preheader |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Body | |Cloned Loop Body |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Exit Block| |Cloned Loop Exit Block|
+// +--------------------+ +-----------+----------+
+// | |
+// +----------+--------------+-----------+
+// |
+// +-----+----+
+// |Join Block|
+// +----------+
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "loop-versioning-licm"
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
+
+using namespace llvm;
+
+/// Threshold for the minimum allowed percentage of possibly invariant
+/// instructions in a loop.
+static cl::opt<float>
+ LVInvarThreshold("licm-versioning-invariant-threshold",
+                     cl::desc("LoopVersioningLICM's minimum allowed percentage "
+                              "of possible invariant instructions per loop"),
+ cl::init(25), cl::Hidden);
+
+/// Threshold for maximum allowed loop nest/depth
+static cl::opt<unsigned> LVLoopDepthThreshold(
+ "licm-versioning-max-depth-threshold",
+ cl::desc(
+ "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
+ cl::init(2), cl::Hidden);
+
+/// \brief Create MDNode for input string.
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+/// \brief Set the given string into the loop metadata, keeping the other
+/// operands intact.
+void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
+ unsigned V) {
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, retain it.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ MDs.push_back(Node);
+ }
+ }
+ // Add new metadata.
+ MDs.push_back(createStringMetadata(TheLoop, MDString, V));
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ TheLoop->setLoopID(NewLoopID);
+}
+
+namespace {
+struct LoopVersioningLICM : public LoopPass {
+ static char ID;
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ LoopVersioningLICM()
+ : LoopPass(ID), AA(nullptr), SE(nullptr), LAA(nullptr), LAI(nullptr),
+ CurLoop(nullptr), LoopDepthThreshold(LVLoopDepthThreshold),
+ InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0),
+ InvariantCounter(0), IsReadOnlyLoop(true) {
+ initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "Loop Versioning for LICM"; }
+
+ void reset() {
+ AA = nullptr;
+ SE = nullptr;
+ LAA = nullptr;
+ CurLoop = nullptr;
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ CurAST.reset();
+ }
+
+ class AutoResetter {
+ public:
+ AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
+ ~AutoResetter() { LVLICM.reset(); }
+
+ private:
+ LoopVersioningLICM &LVLICM;
+ };
+
+private:
+ AliasAnalysis *AA; // Current AliasAnalysis information
+ ScalarEvolution *SE; // Current ScalarEvolution
+ LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis
+ const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo
+
+ Loop *CurLoop; // The current loop we are working on.
+ std::unique_ptr<AliasSetTracker>
+ CurAST; // AliasSet information for the current loop.
+
+ unsigned LoopDepthThreshold; // Maximum loop nest threshold
+ float InvariantThreshold; // Minimum invariant threshold
+ unsigned LoadAndStoreCounter; // Counter to track num of load & store
+ unsigned InvariantCounter; // Counter to track num of invariant
+ bool IsReadOnlyLoop; // Read only loop marker.
+
+ bool isLegalForVersioning();
+ bool legalLoopStructure();
+ bool legalLoopInstructions();
+ bool legalLoopMemoryAccesses();
+ bool isLoopAlreadyVisited();
+ void setNoAliasToLoop(Loop *);
+ bool instructionSafeForVersioning(Instruction *);
+};
+}
+
+/// \brief Check the loop structure and confirm it is suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopStructure() {
+ // Loop must be in loop simplify form.
+ if (!CurLoop->isLoopSimplifyForm()) {
+ DEBUG(
+ dbgs() << " loop is not in loop-simplify form.\n");
+ return false;
+ }
+  // Loop should be an innermost loop; if not, return false.
+ if (CurLoop->getSubLoops().size()) {
+ DEBUG(dbgs() << " loop is not innermost\n");
+ return false;
+ }
+  // Loop should have a single backedge; if not, return false.
+ if (CurLoop->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << " loop has multiple backedges\n");
+ return false;
+ }
+  // Loop must have a single exiting block; if not, return false.
+  if (!CurLoop->getExitingBlock()) {
+    DEBUG(dbgs() << "    loop has multiple exiting blocks\n");
+ return false;
+ }
+ // We only handle bottom-tested loop, i.e. loop in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+ DEBUG(dbgs() << " loop is not bottom tested\n");
+ return false;
+ }
+ // Parallel loops must not have aliasing loop-invariant memory accesses.
+ // Hence we don't need to version anything in this case.
+ if (CurLoop->isAnnotatedParallel()) {
+ DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ return false;
+ }
+  // Loop depth greater than LoopDepthThreshold is not allowed.
+  if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    DEBUG(dbgs() << "    loop depth is more than threshold\n");
+ return false;
+ }
+ // We need to be able to compute the loop trip count in order
+ // to generate the bound checks.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+ if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "    loop does not have a computable trip count\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check the loop's memory accesses and confirm they are suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+ bool HasMayAlias = false;
+ bool TypeSafety = false;
+ bool HasMod = false;
+  // Memory check:
+  // The transform phase will generate a versioned loop and a runtime check to
+  // ensure the pointers are independent and do not alias.
+  // In the versioned variant of the loop, alias metadata asserts that all
+  // accesses are mutually independent.
+  //
+  // Aliasing pointers within a single alias domain are avoided because, with
+  // multiple aliasing domains, we may not be able to hoist a potential
+  // loop-invariant access out of the loop.
+  //
+  // Iterate over the alias sets in the tracker and confirm that none of them
+  // is a must-alias set.
+ for (const auto &I : *CurAST) {
+ const AliasSet &AS = I;
+    // Skip forwarding alias sets; they have been merged into other sets and
+    // should be ignored.
+ if (AS.isForwardingAliasSet())
+ continue;
+    // With a must-alias set it's not worth adding a runtime bound check.
+ if (AS.isMustAlias())
+ return false;
+ Value *SomePtr = AS.begin()->getValue();
+ bool TypeCheck = true;
+ // Check for Mod & MayAlias
+ HasMayAlias |= AS.isMayAlias();
+ HasMod |= AS.isMod();
+ for (const auto &A : AS) {
+ Value *Ptr = A.getValue();
+      // All pointers in the alias set should have the same type.
+ TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+ }
+    // At least one alias set should have pointers of the same type.
+ TypeSafety |= TypeCheck;
+ }
+  // Bail out unless at least one alias set had pointers of a single type.
+ if (!TypeSafety) {
+ DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ return false;
+ }
+  // Ensure the loop body modifies memory; read-only loops are rejected.
+ if (!HasMod) {
+ DEBUG(dbgs() << " No memory modified in loop body\n");
+ return false;
+ }
+  // Make sure at least one alias set has a may-alias case.
+  // If there is no memory aliasing ambiguity, return false.
+ if (!HasMayAlias) {
+ DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether an instruction is safe for loop versioning.
+/// Returns true if it is safe, false otherwise.
+/// The following is checked:
+/// 1) All loads and stores in the loop body are non-atomic and non-volatile.
+/// 2) Function calls are safe, i.e. they do not access memory.
+/// 3) The loop body has no instruction that may throw.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+ assert(I != nullptr && "Null instruction found!");
+ // Check function call safety
+ if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
+ DEBUG(dbgs() << " Unsafe call site found.\n");
+ return false;
+ }
+  // Avoid loops that may throw.
+ if (I->mayThrow()) {
+ DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ return false;
+ }
+  // If the current instruction is a load, make sure it is a simple load
+  // (non-atomic and non-volatile).
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld || !Ld->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple load.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = Ld->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+ }
+  // If the current instruction is a store, make sure it is a simple store
+  // (non-atomic and non-volatile).
+ else if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St || !St->isSimple()) {
+ DEBUG(dbgs() << " Found a non-simple store.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = St->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+
+ IsReadOnlyLoop = false;
+ }
+ return true;
+}
+
+/// \brief Check the loop's instructions and confirm they are suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+ // Resetting counters.
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ // Iterate over loop blocks and instructions of each block and check
+ // instruction safety.
+ for (auto *Block : CurLoop->getBlocks())
+ for (auto &Inst : *Block) {
+ // If instruction is unsafe just return false.
+ if (!instructionSafeForVersioning(&Inst))
+ return false;
+ }
+ // Get LoopAccessInfo from current loop.
+ LAI = &LAA->getInfo(CurLoop);
+ // Check LoopAccessInfo for need of runtime check.
+ if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+ DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ return false;
+ }
+  // The number of runtime checks should be less than RuntimeMemoryCheckThreshold.
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ return false;
+ }
+ // Loop should have at least one invariant load or store instruction.
+ if (!InvariantCounter) {
+ DEBUG(dbgs() << " Invariant not found !!\n");
+ return false;
+ }
+  // Read-only loops are not allowed.
+ if (IsReadOnlyLoop) {
+ DEBUG(dbgs() << " Found a read-only loop!\n");
+ return false;
+ }
+  // Profitability check:
+  // Check that the fraction of invariant accesses meets the threshold.
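+  // For example, with the default threshold of 25 a loop containing 8 loads
+  // and stores needs at least 2 of them to have loop-invariant addresses.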
+ if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+    DEBUG(dbgs()
+          << "    Invariant loads & stores are less than the defined threshold\n");
+ DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
+ DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ return false;
+ }
+ return true;
+}
+
+/// \brief Check whether the loop has already been visited, based on its
+/// loop metadata. Returns true if the loop has been visited before,
+/// false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+  // Check for LoopVersioningLICM metadata on the loop.
+ if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+ return true;
+ }
+ return false;
+}
+
+/// \brief Check legality for LoopVersioningLICM by considering:
+/// a) loop structure legality, b) loop instruction legality,
+/// c) loop memory access legality.
+/// Returns true if legal, false otherwise.
+bool LoopVersioningLICM::isLegalForVersioning() {
+ DEBUG(dbgs() << "Loop: " << *CurLoop);
+  // Make sure we are not revisiting a loop we have already processed.
+ if (isLoopAlreadyVisited()) {
+ DEBUG(
+ dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+ return false;
+ }
+  // Check loop structure legality.
+ if (!legalLoopStructure()) {
+ DEBUG(
+ dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop instruction legality.
+ if (!legalLoopInstructions()) {
+ DEBUG(dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop memory access legality.
+ if (!legalLoopMemoryAccesses()) {
+ DEBUG(dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+ // Loop versioning is feasible, return true.
+ DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ return true;
+}
+
+/// \brief Update the loop with aggressive aliasing assumptions.
+/// Every pair of memory operations is marked no-alias; this is valid because
+/// the legality phase rejects loops that contain must-alias memory access
+/// pairs.
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+ // Get latch terminator instruction.
+ Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+ // Create alias scope domain.
+ MDBuilder MDB(I->getContext());
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+ StringRef Name = "LVAliasScope";
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+  // Iterate over each instruction of the loop and set no-alias metadata on
+  // all load and store instructions.
+ for (auto *Block : CurLoop->getBlocks()) {
+ for (auto &Inst : *Block) {
+      // Only interested in instructions that may modify or read memory.
+ if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+ continue;
+ Scopes.push_back(NewScope);
+ NoAliases.push_back(NewScope);
+ // Set no-alias for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(Inst.getContext(), NoAliases)));
+ // set alias-scope for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Inst.getContext(), Scopes)));
+ }
+ }
+}
+
+bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  // This will automatically release all resources held by the current
+  // LoopVersioningLICM object.
+ AutoResetter Resetter(*this);
+
+ if (skipLoop(L))
+ return false;
+ // Get Analysis information.
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ LAI = nullptr;
+ // Set Current Loop
+ CurLoop = L;
+ CurAST.reset(new AliasSetTracker(*AA));
+
+ // Loop over the body of this loop, construct AST.
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ for (auto *Block : L->getBlocks()) {
+ if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
+ CurAST->add(*Block); // Incorporate the specified basic block
+ }
+
+ bool Changed = false;
+
+  // Check the feasibility of LoopVersioningLICM.
+  // If versioning is found to be feasible and beneficial then proceed;
+  // otherwise simply return and let the AutoResetter clean up.
+ if (isLegalForVersioning()) {
+ // Do loop versioning.
+ // Create memcheck for memory accessed inside loop.
+ // Clone original loop, and set blocks properly.
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
+ LVer.versionLoop();
+    // Set loop versioning metadata on the original loop.
+    addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+    // Set loop versioning metadata on the versioned loop.
+    addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+    // Set "llvm.mem.parallel_loop_access" metadata on the versioned loop.
+    addStringMetadataToLoop(LVer.getVersionedLoop(),
+                            "llvm.mem.parallel_loop_access");
+    // Update the versioned loop with aggressive aliasing assumptions.
+ setNoAliasToLoop(LVer.getVersionedLoop());
+ Changed = true;
+ }
+ return Changed;
+}
+
+char LoopVersioningLICM::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm",
+ "Loop Versioning For LICM", false, false)
+
+Pass *llvm::createLoopVersioningLICMPass() { return new LoopVersioningLICM(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
new file mode 100644
index 000000000000..08e60b16bedf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -0,0 +1,174 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
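+// For example, an 'atomicrmw add' becomes a plain load, add and store that
+// yields the original value, a 'cmpxchg' becomes a load, compare, select and
+// store, fences are removed, and atomic loads and stores are rewritten as
+// ordinary non-atomic loads and stores.
+//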
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+ IRBuilder<> Builder(CXI);
+ Value *Ptr = CXI->getPointerOperand();
+ Value *Cmp = CXI->getCompareOperand();
+ Value *Val = CXI->getNewValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateStore(Res, Ptr);
+
+ Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+ CXI->replaceAllUsesWith(Res);
+ CXI->eraseFromParent();
+ return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+ IRBuilder<> Builder(RMWI);
+ Value *Ptr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Res = nullptr;
+
+ switch (RMWI->getOperation()) {
+ default: llvm_unreachable("Unexpected RMW operation");
+ case AtomicRMWInst::Xchg:
+ Res = Val;
+ break;
+ case AtomicRMWInst::Add:
+ Res = Builder.CreateAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::Sub:
+ Res = Builder.CreateSub(Orig, Val);
+ break;
+ case AtomicRMWInst::And:
+ Res = Builder.CreateAnd(Orig, Val);
+ break;
+ case AtomicRMWInst::Nand:
+ Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+ break;
+ case AtomicRMWInst::Or:
+ Res = Builder.CreateOr(Orig, Val);
+ break;
+ case AtomicRMWInst::Xor:
+ Res = Builder.CreateXor(Orig, Val);
+ break;
+ case AtomicRMWInst::Max:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::Min:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::UMax:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::UMin:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Orig, Val);
+ break;
+ }
+ Builder.CreateStore(Res, Ptr);
+ RMWI->replaceAllUsesWith(Orig);
+ RMWI->eraseFromParent();
+ return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+ FI->eraseFromParent();
+ return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+ LI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+ SI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE;) {
+ Instruction *Inst = &*DI++;
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+    else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      if (LI->isAtomic())
+        Changed |= LowerLoadInst(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (SI->isAtomic())
+        Changed |= LowerStoreInst(SI);
+    }
+ }
+ return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ Changed |= runOnBasicBlock(BB);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+ if (lowerAtomics(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerAtomicLegacyPass() : FunctionPass(ID) {
+ initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ LowerAtomicPass Impl;
+};
+}
+
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+ "Lower atomic intrinsics to non-atomic form", false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 000000000000..46f8a3564265
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,383 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
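+// For example, a conditional branch whose condition is fed by
+// 'call i1 @llvm.expect.i1(i1 %cmp, i1 1)' is annotated with "branch_weights"
+// profile metadata marking the true edge as likely (using the
+// -likely-branch-weight/-unlikely-branch-weight values defined below), and
+// the expect call itself is removed.
+//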
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() were explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it
+// stands, programmers may be using __builtin_expect() / llvm.expect to
+// annotate that a branch is merely likely or unlikely to be taken.
+//
+// There is a known dependency on this ratio in CodeGenPrepare when transforming
+// 'select' instructions. It may be worthwhile to hoist these values to some
+// shared space, so they can be used directly by other passes.
+
+static cl::opt<uint32_t> LikelyBranchWeight(
+ "likely-branch-weight", cl::Hidden, cl::init(2000),
+ cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+static cl::opt<uint32_t> UnlikelyBranchWeight(
+ "unlikely-branch-weight", cl::Hidden, cl::init(1),
+ cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+ CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
+  unsigned n = SI.getNumCases();
+  // One extra weight slot for the default case.
+  SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+
+ if (Case == *SI.case_default())
+ Weights[0] = LikelyBranchWeight;
+ else
+ Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
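+  // For example, with the default weights a switch with three cases whose
+  // expected value matches case index 1 gets branch weights {1, 1, 2000, 1},
+  // where the first weight belongs to the default destination.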
+
+ SI.setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+ SI.setCondition(ArgValue);
+ return true;
+}
+
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If an operand of the phi has a constant value that contradicts the
+/// expected value of the phi def, then the corresponding incoming edge of the
+/// phi is unlikely to be taken. Using that information, the branch
+/// probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+  // Walk backward through the chain of instructions that have 'copy'
+  // semantics, stripping the copies, until a PHI node or an instruction of
+  // unknown kind is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+  // Get the first dominating conditional branch of operand i's
+  // incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+  // Now walk through all the phi operands to find those with values
+  // conflicting with the expected phi output value. Any such operand
+  // indicates that the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+    // Not an interesting case -- we cannot infer anything useful when the
+    // operand value matches the expected phi output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+    // IsOpndComingFromSuccessor returns true if the operand coming from
+    // OpndIncomingBB arrives via the outgoing edge of BI that leads to Succ.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
+// Handle both BranchInst and SelectInst.
+template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
+
+ // Handle non-optimized IR code like:
+ // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+ // %tobool = icmp ne i64 %expval, 0
+ // br i1 %tobool, label %if.then, label %if.end
+ //
+ // Or the following simpler case:
+ // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+ // br i1 %expval, label %if.then, label %if.end
+
+ CallInst *CI;
+
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
+ if (!CmpI) {
+ CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
+ } else {
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
+ return false;
+ CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+ }
+
+ if (!CI)
+ return false;
+
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
+
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ))
+ Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+ else
+ Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
+
+ if (CmpI)
+ CmpI->setOperand(0, ArgValue);
+ else
+ BSI.setCondition(ArgValue);
+ return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ return handleBrSelExpect<BranchInst>(BI);
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // Create "block_weights" metadata.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (handleBranchExpect(*BI))
+ ExpectIntrinsicsHandled++;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ if (handleSwitchExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+
+ // Remove llvm.expect intrinsics. Iterate backwards in order
+ // to process select instructions before the intrinsic gets
+ // removed.
+ for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
+ Instruction *Inst = &*BI++;
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (handleBrSelExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+ continue;
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+        // Before erasing the llvm.expect, walk backward to find the
+        // phi that defines llvm.expect's first argument, and infer
+        // the branch probability:
+ handlePhiDef(CI);
+ Value *Exp = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(Exp);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (lowerExpectIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+ static char ID;
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+ "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+ return new LowerExpectIntrinsic();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
new file mode 100644
index 000000000000..070114a84cc5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -0,0 +1,137 @@
+//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
+// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
+// be widened.
+//
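+// For example, 'call void (i1, ...) @llvm.experimental.guard(i1 %c) ["deopt"(...)]'
+// becomes a conditional branch on %c to a "guarded" block (the fall-through
+// path) and a "deopt" block that calls @llvm.experimental.deoptimize with the
+// guard's deoptimization state and returns its result.
+//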
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<uint32_t> PredicatePassBranchWeight(
+ "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
+ cl::desc("The probability of a guard failing is assumed to be the "
+ "reciprocal of this value (default = 1 << 20)"));
+
+namespace {
+struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
+ static char ID;
+ LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static void MakeGuardControlFlowExplicit(Function *DeoptIntrinsic,
+ CallInst *CI) {
+ OperandBundleDef DeoptOB(*CI->getOperandBundle(LLVMContext::OB_deopt));
+ SmallVector<Value *, 4> Args(std::next(CI->arg_begin()), CI->arg_end());
+
+ auto *CheckBB = CI->getParent();
+ auto *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(CI->getArgOperand(0), CI, true);
+
+ auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ CheckBI->getSuccessor(0)->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+
+ if (auto *MD = CI->getMetadata(LLVMContext::MD_make_implicit))
+ CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
+
+ MDBuilder MDB(CI->getContext());
+ CheckBI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(PredicatePassBranchWeight, 1));
+
+ IRBuilder<> B(DeoptBlockTerm);
+ auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
+
+ if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
+ B.CreateRetVoid();
+ } else {
+ DeoptCall->setName("deoptcall");
+ B.CreateRet(DeoptCall);
+ }
+
+ DeoptCall->setCallingConv(CI->getCallingConv());
+ DeoptBlockTerm->eraseFromParent();
+}
+
+static bool lowerGuardIntrinsic(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_guard)
+ ToLower.push_back(CI);
+
+ if (ToLower.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *CI : ToLower) {
+ MakeGuardControlFlowExplicit(DeoptIntrinsic, CI);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
+ return lowerGuardIntrinsic(F);
+}
+
+char LowerGuardIntrinsicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
+ "Lower the guard intrinsic to normal control flow", false,
+ false)
+
+Pass *llvm::createLowerGuardIntrinsicPass() {
+ return new LowerGuardIntrinsicLegacyPass();
+}
+
+PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerGuardIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 000000000000..7896396f0898
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,1485 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
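+// For example, a run of adjacent stores of the same byte value is merged into
+// a single memset, and (among other transformations) a memcpy whose source
+// was itself just written by another memcpy or memset can be rewritten to
+// read from the original source.
+//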
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+ bool &VariableIdxFound,
+ const DataLayout &DL) {
+ // Skip over the first indices.
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned i = 1; i != Idx; ++i, ++GTI)
+ /*skip along*/;
+
+ // Compute the offset implied by the rest of the indices.
+ int64_t Offset = 0;
+ for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!OpC)
+ return VariableIdxFound = true;
+ if (OpC->isZero()) continue; // No offset.
+
+ // Handle struct indices, which add their field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+ continue;
+ }
+
+ // Otherwise, we have a sequential type like an array or vector. Multiply
+ // the index by the ElementSize.
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*OpC->getSExtValue();
+ }
+
+ return Offset;
+}
+
+/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and
+/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2
+/// might be &A[40]. In this case offset would be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+ const DataLayout &DL) {
+ Ptr1 = Ptr1->stripPointerCasts();
+ Ptr2 = Ptr2->stripPointerCasts();
+
+ // Handle the trivial case first.
+ if (Ptr1 == Ptr2) {
+ Offset = 0;
+ return true;
+ }
+
+ GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+ GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
+
+ bool VariableIdxFound = false;
+
+ // If one pointer is a GEP and the other isn't, then see if the GEP is a
+ // constant offset from the base, as in "P" and "gep P, 1".
+ if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
+ Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
+ return !VariableIdxFound;
+ }
+
+ if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
+ Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
+ return !VariableIdxFound;
+ }
+
+  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+  // base. After that base, they may have some number of common (and
+  // potentially variable) indices. After those, each may have a constant
+  // offset, and those constant offsets determine the distance between the two
+  // pointers. We handle no other case at this point.
+ if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+ return false;
+
+ // Skip any common indices and track the GEP types.
+ unsigned Idx = 1;
+ for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+ if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+ break;
+
+ int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
+ int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
+ if (VariableIdxFound) return false;
+
+ Offset = Offset2-Offset1;
+ return true;
+}
+
+namespace {
+
+/// Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+/// store 0 -> P+1
+/// store 0 -> P+0
+/// store 0 -> P+3
+/// store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc. When we see
+/// the first store, we make a range [1, 2). The second store extends the range
+/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+struct MemsetRange {
+  // Start/End - A semi-open range that describes the span this range covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The getelementptr instruction that points to the start of the
+ /// range.
+ Value *StartPtr;
+
+ /// Alignment - The known alignment of the first store.
+ unsigned Alignment;
+
+ /// TheStores - The actual stores that make up this range.
+ SmallVector<Instruction*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const DataLayout &DL) const;
+};
+
+} // end anonymous namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+ // If we found more than 4 stores to merge or 16 bytes, use memset.
+ if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+ // If there is nothing to merge, don't do anything.
+ if (TheStores.size() < 2) return false;
+
+ // If any of the stores are a memset, then it is always good to extend the
+ // memset.
+ for (Instruction *SI : TheStores)
+ if (!isa<StoreInst>(SI))
+ return true;
+
+ // Assume that the code generator is capable of merging pairs of stores
+ // together if it wants to.
+ if (TheStores.size() == 2) return false;
+
+ // If we have fewer than 8 stores, it can still be worthwhile to do this.
+ // For example, merging 4 i8 stores into an i32 store is useful almost always.
+ // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+ // memset will be split into 2 32-bit stores anyway) and doing so can
+ // pessimize the llvm optimizer.
+ //
+ // Since we don't have perfect knowledge here, make some assumptions: assume
+ // the maximum GPR width is the same size as the largest legal integer
+ // size. If so, check to see whether we will end up actually reducing the
+ // number of stores used.
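+  //
+  // For example, three stores (an i32 and two i16s) covering 8 contiguous
+  // bytes on a target whose largest legal integer type is 64 bits give
+  // NumPointerStores = 1 and NumByteStores = 0, so replacing the 3 stores
+  // with a single memset is considered profitable.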
+ unsigned Bytes = unsigned(End-Start);
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+ if (MaxIntSize == 0)
+ MaxIntSize = 1;
+ unsigned NumPointerStores = Bytes / MaxIntSize;
+
+ // Assume the remaining bytes if any are done a byte at a time.
+ unsigned NumByteStores = Bytes % MaxIntSize;
+
+ // If we will reduce the # stores (according to this heuristic), do the
+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+ // etc.
+ return TheStores.size() > NumPointerStores+NumByteStores;
+}
+
+namespace {
+
+class MemsetRanges {
+ /// A sorted list of the memset ranges.
+ SmallVector<MemsetRange, 8> Ranges;
+ typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+ const DataLayout &DL;
+
+public:
+ MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+ typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+ bool empty() const { return Ranges.empty(); }
+
+ void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ addStore(OffsetFromFirst, SI);
+ else
+ addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+ }
+
+ void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+ int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ addRange(OffsetFromFirst, StoreSize,
+ SI->getPointerOperand(), SI->getAlignment(), SI);
+ }
+
+ void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+ int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ }
+
+ void addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst);
+
+};
+
+} // end anonymous namespace
+
+/// Add a new store to the MemsetRanges data structure. This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst) {
+ int64_t End = Start+Size;
+
+ range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
+ [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+
+  // At this point either I == Ranges.end(), meaning we found nothing to merge
+  // with, or Start <= I->End. If End < I->Start or I == Ranges.end(), we need
+  // to insert a new range, so handle that now.
+ if (I == Ranges.end() || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
+ R.TheStores.push_back(Inst);
+ return;
+ }
+
+ // This store overlaps with I, add it.
+ I->TheStores.push_back(Inst);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+ // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+ // but is not entirely contained within the range.
+
+  // See if this store extends the start of the range. In that case it cannot
+  // possibly cause the range to join the prior range, because otherwise we
+  // would have stopped on *that* range.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = Ptr;
+ I->Alignment = Alignment;
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != Ranges.end() && End >= NextI->Start) {
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MemCpyOptLegacyPass Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+  // This transformation requires dominator tree information.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MemCpyOptLegacyPass::ID = 0;
+
+} // end anonymous namespace
+
+/// The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for stores to neighboring locations of
+/// memory. If it sees enough consecutive ones, it attempts to merge them
+/// together into a memcpy/memset.
+Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr,
+ Value *ByteVal) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+  // Okay, so we now have a single store of a splatable value. Scan to find
+  // all subsequent stores of the same value at constant offsets from the same
+  // pointer. Join these together into ranges, so we can decide whether
+  // contiguous blocks are stored.
+ MemsetRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
+ for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+      // If the instruction is readnone, ignore it; otherwise bail out. We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memset(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextStore->isSimple()) break;
+
+      // Check to see if this store's value is the same byte-splattable value.
+ if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+ DL))
+ break;
+
+ Ranges.addStore(Offset, NextStore);
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+ !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+ break;
+
+ Ranges.addMemSet(Offset, MSI);
+ }
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return nullptr;
+
+  // If we had at least one store that could be merged in, add the starting
+  // store as well. As a small compile-time optimization, we avoid doing this
+  // until we know there is something interesting to merge with.
+ Ranges.addInst(0, StartInst);
+
+  // If we create any memsets, we put them right before the first instruction
+  // that isn't part of the memset block. This ensures that the memset is
+  // dominated by any addressing instruction needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memsets for anything big enough to be worthwhile.
+ Instruction *AMemSet = nullptr;
+ for (const MemsetRange &Range : Ranges) {
+
+ if (Range.TheStores.size() == 1) continue;
+
+ // If it is profitable to lower this range to memset, do so now.
+ if (!Range.isProfitableToUseMemset(DL))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset.
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ // Determine alignment
+ unsigned Alignment = Range.Alignment;
+ if (Alignment == 0) {
+ Type *EltType =
+ cast<PointerType>(StartPtr->getType())->getElementType();
+ Alignment = DL.getABITypeAlignment(EltType);
+ }
+
+ AMemSet =
+ Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+
+ DEBUG(dbgs() << "Replace stores:\n";
+ for (Instruction *SI : Range.TheStores)
+ dbgs() << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
+
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+ // Zap all the stores.
+ for (Instruction *SI : Range.TheStores) {
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ }
+ ++NumMemSetInfer;
+ }
+
+ return AMemSet;
+}
+
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = SI->getAlignment();
+ if (!StoreAlign)
+ StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ unsigned LoadAlign = LI->getAlignment();
+ if (!LoadAlign)
+ LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+ return std::min(StoreAlign, LoadAlign);
+}
+
+// This method tries to lift a store instruction before position P.
+// It will lift the store and its arguments, plus anything that may alias
+// with them.
+// It returns true if it was successful.
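+//
+// A rough sketch of the intent (hypothetical IR, illustration only):
+//   before:  P: call @f();  %p = gep ...;  store %v, %p
+//   after:   %p = gep ...;  store %v, %p;  P: call @f()
+// i.e. the store and the instructions feeding it end up before P, provided
+// nothing that stays behind may alias them.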
+static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
+ const LoadInst *LI) {
+  // If the store aliases this position, bail out early.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef)
+ return false;
+
+  // Keep track of the arguments of all instructions we plan to lift
+  // so we can make sure to lift them as well if appropriate.
+ DenseSet<Instruction*> Args;
+ if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
+ if (Ptr->getParent() == SI->getParent())
+ Args.insert(Ptr);
+
+  // Instructions to lift before P.
+ SmallVector<Instruction*, 8> ToLift;
+
+ // Memory locations of lifted instructions.
+ SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
+
+ // Lifted callsites.
+ SmallVector<ImmutableCallSite, 8> CallSites;
+
+ const MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
+ auto *C = &*I;
+
+ bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef;
+
+ bool NeedLift = false;
+ if (Args.erase(C))
+ NeedLift = true;
+ else if (MayAlias) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ return AA.getModRefInfo(C, ML);
+ });
+
+ if (!NeedLift)
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
+ }
+
+ if (!NeedLift)
+ continue;
+
+ if (MayAlias) {
+ // Since LI is implicitly moved downwards past the lifted instructions,
+ // none of them may modify its source.
+ if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod)
+ return false;
+ else if (auto CS = ImmutableCallSite(C)) {
+ // If we can't lift this before P, it's game over.
+ if (AA.getModRefInfo(P, CS) != MRI_NoModRef)
+ return false;
+
+ CallSites.push_back(CS);
+ } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
+ // If we can't lift this before P, it's game over.
+ auto ML = MemoryLocation::get(C);
+ if (AA.getModRefInfo(P, ML) != MRI_NoModRef)
+ return false;
+
+ MemLocs.push_back(ML);
+ } else
+ // We don't know how to lift this instruction.
+ return false;
+ }
+
+ ToLift.push_back(C);
+ for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
+ if (A->getParent() == SI->getParent())
+ Args.insert(A);
+ }
+
+  // We made it; now lift everything we collected.
+ for (auto *I : llvm::reverse(ToLift)) {
+ DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ I->moveBefore(P);
+ }
+
+ return true;
+}
+
+bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+ if (!SI->isSimple()) return false;
+
+ // Avoid merging nontemporal stores since the resulting
+ // memcpy/memset would not be able to preserve the nontemporal hint.
+ // In theory we could teach how to propagate the !nontemporal metadata to
+ // memset calls. However, that change would force the backend to
+ // conservatively expand !nontemporal memset calls back to sequences of
+ // store instructions (effectively undoing the merging).
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+
+ // Load to store forwarding can be interpreted as memcpy.
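+  // For illustration only (a hypothetical C-level sketch, not from the pass):
+  //   *dst = *src;                     // a load feeding a store
+  // can be rewritten as roughly
+  //   memcpy(dst, src, sizeof(*dst));  // or memmove if the two may overlap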
+ if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+ if (LI->isSimple() && LI->hasOneUse() &&
+ LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
+ Instruction *P = SI;
+ for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+ if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) {
+ P = &I;
+ break;
+ }
+ }
+
+        // We found an instruction that may write to the loaded memory.
+        // We can try to promote at this position instead of the store
+        // position if nothing aliases the store memory after this and the
+        // store destination is not in the range.
+ if (P && P != SI) {
+ if (!moveUp(AA, SI, P, LI))
+ P = nullptr;
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+          // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantics. If not, memcpy can
+          // be used.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ UseMemMove = true;
+
+ unsigned Align = findCommonAlignment(DL, SI, LI);
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+ else
+ M = Builder.CreateMemCpy(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+ << " => " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
+ MemDepResult ldep = MD->getDependency(LI);
+ CallInst *C = nullptr;
+ if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+ C = dyn_cast<CallInst>(ldep.getInst());
+
+ if (C) {
+ // Check that nothing touches the dest of the "copy" between
+ // the call and the store.
+ Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
+ bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
+ I != E; --I) {
+ if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+ C = nullptr;
+ break;
+ }
+ // The store to dest may never happen if an exception can be thrown
+ // between the load and the store.
+ if (I->mayThrow() && !CpyDestIsLocal) {
+ C = nullptr;
+ break;
+ }
+ }
+ }
+
+ if (C) {
+ bool changed = performCallSlotOptzn(
+ LI, SI->getPointerOperand()->stripPointerCasts(),
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ findCommonAlignment(DL, SI, LI), C);
+ if (changed) {
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ }
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset a byte
+  // at a time, like "0" or "-1" of any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+ auto *V = SI->getOperand(0);
+ if (Value *ByteVal = isBytewiseValue(V)) {
+ if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+ ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (T->isAggregateType()) {
+ uint64_t Size = DL.getTypeStoreSize(T);
+ unsigned Align = SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
+ Size, Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+ // See if there is another memset or store neighboring this memset which
+ // allows us to widen out the memset to do a single larger store.
+ if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+ if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+ MSI->getValue())) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+ return false;
+}
+
+/// Takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
+ Value *cpySrc, uint64_t cpyLen,
+ unsigned cpyAlign, CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
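+  //
+  // As a hypothetical C-level sketch (illustration only, names made up):
+  //
+  //   struct S tmp;                  struct S dest;
+  //   compute(&tmp);           ==>   compute(&dest);
+  //   memcpy(&dest, &tmp, n);        // memcpy and tmp become dead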
+
+ // Lifetime marks shouldn't be operated on.
+ if (Function *F = C->getCalledFunction())
+ if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+ return false;
+
+ // Deliberately get the source and destination with bitcasts stripped away,
+ // because we'll need to do type comparisons based on the underlying type.
+ CallSite CS(C);
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
+ const DataLayout &DL = cpy->getModule()->getDataLayout();
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLen < srcSize)
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
+ if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
+    // The destination is an alloca. Check that it is at least srcSize bytes.
+ ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+ if (!destArraySize)
+ return false;
+
+ uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+ destArraySize->getZExtValue();
+
+ if (destSize < srcSize)
+ return false;
+ } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+ // The store to dest may never happen if the call can throw.
+ if (C->mayThrow())
+ return false;
+
+ if (A->getDereferenceableBytes() < srcSize) {
+ // If the destination is an sret parameter then only accesses that are
+ // outside of the returned struct type can trap.
+ if (!A->hasStructRetAttr())
+ return false;
+
+ Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+ if (!StructTy->isSized()) {
+        // The call may never return and hence the copy-instruction may never
+        // be executed, and therefore it's not safe to say "the destination
+        // has at least <cpyLen> bytes, as implied by the copy-instruction".
+ return false;
+ }
+
+ uint64_t destSize = DL.getTypeAllocSize(StructTy);
+ if (destSize < srcSize)
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ // Check that dest points to memory that is at least as aligned as src.
+ unsigned srcAlign = srcAlloca->getAlignment();
+ if (!srcAlign)
+ srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
+ SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
+ srcAlloca->user_end());
+ while (!srcUseList.empty()) {
+ User *U = srcUseList.pop_back_val();
+
+ if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
+ return false;
+
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IT->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+
+ if (U != C && U != cpy)
+ return false;
+ }
+
+ // Check that src isn't captured by the called function since the
+ // transformation can cause aliasing issues in that case.
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+ return false;
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
+ DominatorTree &DT = LookupDomTree();
+ if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
+ if (!DT.dominates(cpyDestInst, C))
+ return false;
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
+ AliasAnalysis &AA = LookupAliasAnalysis();
+ ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
+ // If necessary, perform additional analysis.
+ if (MR != MRI_NoModRef)
+ MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+ if (MR != MRI_NoModRef)
+ return false;
+
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (CS.getArgument(i)->getType() == Dest->getType())
+ CS.setArgument(i, Dest);
+ else
+ CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+ CS.getArgument(i)->getType(), Dest->getName(), C));
+ }
+
+ if (!changedArgument)
+ return false;
+
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
+ MD->removeInstruction(C);
+
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(C, cpy, KnownIDs);
+
+ // Remove the memcpy.
+ MD->removeInstruction(cpy);
+ ++NumMemCpyInstr;
+
+ return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
+bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
+ MemCpyInst *MDep) {
+  // We can only transform memcpys where the dest of one is the source of the
+  // other.
+ if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+ return false;
+
+  // If the dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction. Just
+  // ignore the input and let someone else zap MDep. This handles cases like:
+ // memcpy(a <- a)
+ // memcpy(b <- a)
+ if (M->getSource() == MDep->getSource())
+ return false;
+
+  // Second, the lengths of the memcpys must be the same, or the preceding one
+  // must be at least as large as the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+ return false;
+
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ // Verify that the copied-from memory doesn't change in between the two
+ // transfers. For example, in:
+ // memcpy(a <- b)
+ // *b = 42;
+ // memcpy(c <- a)
+ // It would be invalid to transform the second memcpy into memcpy(c <- b).
+ //
+ // TODO: If the code between M and MDep is transparent to the destination "c",
+ // then we could still perform the xform by moving M up to the first memcpy.
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep =
+ MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+ M->getIterator(), M->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ // If the dest of the second might alias the source of the first, then the
+ // source and dest might overlap. We still want to eliminate the intermediate
+ // value, but we have to generate a memmove instead of memcpy.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(MDep)))
+ UseMemMove = true;
+
+ // If all checks passed, then we can transform M.
+
+ // Make sure to use the lesser of the alignment of the source and the dest
+ // since we're changing where we're reading from, but don't want to increase
+ // the alignment past what can be read from or written to.
+ // TODO: Is this worth it if we're creating a less aligned memcpy? For
+ // example we could be moving from movaps -> movq on x86.
+ unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+ IRBuilder<> Builder(M);
+ if (UseMemMove)
+ Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+ else
+ Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+
+ // Remove the instruction we're replacing.
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dst, c, dst_size);
+/// memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+/// memcpy(dst, src, src_size);
+/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ // We can only transform memset/memcpy with the same destination.
+ if (MemSet->getDest() != MemCpy->getDest())
+ return false;
+
+ // Check that there are no other dependencies on the memset destination.
+ MemDepResult DstDepInfo =
+ MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
+ MemCpy->getIterator(), MemCpy->getParent());
+ if (DstDepInfo.getInst() != MemSet)
+ return false;
+
+ // Use the same i8* dest as the memcpy, killing the memset dest if different.
+ Value *Dest = MemCpy->getRawDest();
+ Value *DestSize = MemSet->getLength();
+ Value *SrcSize = MemCpy->getLength();
+
+ // By default, create an unaligned memset.
+ unsigned Align = 1;
+ // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+ // of the sum.
+ const unsigned DestAlign =
+ std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ if (DestAlign > 1)
+ if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+ IRBuilder<> Builder(MemCpy);
+
+ // If the sizes have different types, zext the smaller one.
+ if (DestSize->getType() != SrcSize->getType()) {
+ if (DestSize->getType()->getIntegerBitWidth() >
+ SrcSize->getType()->getIntegerBitWidth())
+ SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+ else
+ DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+ }
+
+ Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
+ Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
+ Value *MemsetLen = Builder.CreateSelect(
+ Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
+ Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
+ MemsetLen, Align);
+
+ MD->removeInstruction(MemSet);
+ MemSet->eraseFromParent();
+ return true;
+}
+
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+  // Make sure the memcpy's source is the memset's destination, i.e. we are
+  // memsetting and memcpying the same address. Otherwise it is hard to reason
+  // about.
+ if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
+ return false;
+
+ ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+ ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+ // Make sure the memcpy doesn't read any more than what the memset wrote.
+ // Don't worry about sizes larger than i64.
+ if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
+ return false;
+
+ IRBuilder<> Builder(MemCpy);
+ Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+ CopySize, MemCpy->getAlignment());
+ return true;
+}
+
+/// Perform simplification of memcpy's. If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
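+///
+/// As a sketch (illustrative pseudo-IR only):
+/// \code
+///   memcpy(Y, X, n)       memcpy(Y, X, n)   ; may now be dead, DSE can remove
+///   memcpy(Z, Y, n)  =>   memcpy(Z, X, n)
+/// \endcode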
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+ // We can only optimize non-volatile memcpy's.
+ if (M->isVolatile()) return false;
+
+ // If the source and destination of the memcpy are the same, then zap it.
+ if (M->getSource() == M->getDest()) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ return false;
+ }
+
+ // If copying from a constant, try to turn the memcpy into a memset.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+ IRBuilder<> Builder(M);
+ Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+ M->getAlignment(), false);
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumCpyToSet;
+ return true;
+ }
+
+ MemDepResult DepInfo = MD->getDependency(M);
+
+ // Try to turn a partially redundant memset + memcpy into
+ // memcpy + smaller memset. We don't need the memcpy size for this.
+ if (DepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+ if (processMemSetMemCpyDependence(M, MDep))
+ return true;
+
+ // The optimizations after this point require the memcpy size.
+ ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+ if (!CopySize) return false;
+
+ // There are four possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
+ // b) call-memcpy xform for return slot optimization.
+ // c) memcpy from freshly alloca'd space or space that has just started its
+ // lifetime copies undefined data, and we can therefore eliminate the
+ // memcpy in favor of the data that was already at the destination.
+ // d) memcpy from a just-memset'd source can be turned into memset.
+ if (DepInfo.isClobber()) {
+ if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+ if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+ CopySize->getZExtValue(), M->getAlignment(),
+ C)) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+ MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+ SrcLoc, true, M->getIterator(), M->getParent());
+
+ if (SrcDepInfo.isClobber()) {
+ if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+ return processMemCpyMemCpyDependence(M, MDep);
+ } else if (SrcDepInfo.isDef()) {
+ Instruction *I = SrcDepInfo.getInst();
+ bool hasUndefContents = false;
+
+ if (isa<AllocaInst>(I)) {
+ hasUndefContents = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ if (LTSize->getZExtValue() >= CopySize->getZExtValue())
+ hasUndefContents = true;
+ }
+
+ if (hasUndefContents) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+
+ if (SrcDepInfo.isClobber())
+ if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+ if (performMemCpyToMemSetOptzn(M, MDep)) {
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumCpyToSet;
+ return true;
+ }
+
+ return false;
+}
+
+/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
+/// not to alias.
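+///
+/// A minimal sketch of the rewrite (illustration only):
+/// \code
+///   memmove(dst, src, n)  =>  memcpy(dst, src, n)   ; when dst and src
+///                                                   ; cannot alias
+/// \endcode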
+bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+ AliasAnalysis &AA = LookupAliasAnalysis();
+
+ if (!TLI->has(LibFunc_memmove))
+ return false;
+
+ // See if the pointers alias.
+ if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(M)))
+ return false;
+
+ DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
+
+ // If not, then we know we can transform this.
+ Type *ArgTys[3] = { M->getRawDest()->getType(),
+ M->getRawSource()->getType(),
+ M->getLength()->getType() };
+ M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
+ Intrinsic::memcpy, ArgTys));
+
+  // MemDep may have overly conservative information about this instruction;
+  // just conservatively flush it from the cache.
+ MD->removeInstruction(M);
+
+ ++NumMoveToCpy;
+ return true;
+}
+
+/// This is called on every byval argument in call sites.
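+///
+/// The pattern it looks for is roughly (hypothetical sketch, not from the
+/// source):
+/// \code
+///   memcpy(tmp, src, n)        memcpy(tmp, src, n)   ; may now become dead
+///   call @f(byval tmp)    =>   call @f(byval src)
+/// \endcode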
+bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
+ const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
+ // Find out what feeds this byval argument.
+ Value *ByValArg = CS.getArgument(ArgNo);
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+ MemDepResult DepInfo = MD->getPointerDependencyFrom(
+ MemoryLocation(ByValArg, ByValSize), true,
+ CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
+ if (!DepInfo.isClobber())
+ return false;
+
+ // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, see if we can byval from the source of the memcpy instead of the
+ // result.
+ MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+ if (!MDep || MDep->isVolatile() ||
+ ByValArg->stripPointerCasts() != MDep->getDest())
+ return false;
+
+  // The length of the memcpy must be larger than or equal to the size of the
+  // byval.
+ ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ return false;
+
+ // Get the alignment of the byval. If the call doesn't specify the alignment,
+ // then it is some target specific value that we can't know.
+ unsigned ByValAlign = CS.getParamAlignment(ArgNo);
+ if (ByValAlign == 0) return false;
+
+ // If it is greater than the memcpy, then we check to see if we can force the
+ // source of the memcpy to the alignment we need. If we fail, we bail out.
+ AssumptionCache &AC = LookupAssumptionCache();
+ DominatorTree &DT = LookupDomTree();
+ if (MDep->getAlignment() < ByValAlign &&
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
+ CS.getInstruction(), &AC, &DT) < ByValAlign)
+ return false;
+
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the memcpy and
+ // the byval call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+  // It would be invalid to transform the call into foo(*b).
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(
+ MemoryLocation::getForSource(MDep), false,
+ CS.getInstruction()->getIterator(), MDep->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ Value *TmpCast = MDep->getSource();
+ if (MDep->getSource()->getType() != ByValArg->getType())
+ TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+ "tmpcast", CS.getInstruction());
+
+ DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
+
+ // Otherwise we're good! Update the byval argument.
+ CS.setArgument(ArgNo, TmpCast);
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// Executes one iteration of MemCpyOptPass.
+bool MemCpyOptPass::iterateOnFunction(Function &F) {
+ bool MadeChange = false;
+
+  // Walk all instructions in the function.
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ // Avoid invalidating the iterator.
+ Instruction *I = &*BI++;
+
+ bool RepeatInstruction = false;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ MadeChange |= processStore(SI, BI);
+ else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ RepeatInstruction = processMemSet(M, BI);
+ else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ RepeatInstruction = processMemCpy(M);
+ else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ RepeatInstruction = processMemMove(M);
+ else if (auto CS = CallSite(I)) {
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.isByValArgument(i))
+ MadeChange |= processByValArgument(CS, i);
+ }
+
+ // Reprocess the instruction if desired.
+ if (RepeatInstruction) {
+ if (BI != BB.begin())
+ --BI;
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto LookupAliasAnalysis = [&]() -> AliasAnalysis & {
+ return AM.getResult<AAManager>(F);
+ };
+ auto LookupAssumptionCache = [&]() -> AssumptionCache & {
+ return AM.getResult<AssumptionAnalysis>(F);
+ };
+ auto LookupDomTree = [&]() -> DominatorTree & {
+ return AM.getResult<DominatorTreeAnalysis>(F);
+ };
+
+ bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis,
+ LookupAssumptionCache, LookupDomTree);
+ if (!MadeChange)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+bool MemCpyOptPass::runImpl(
+ Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_,
+ std::function<AliasAnalysis &()> LookupAliasAnalysis_,
+ std::function<AssumptionCache &()> LookupAssumptionCache_,
+ std::function<DominatorTree &()> LookupDomTree_) {
+ bool MadeChange = false;
+ MD = MD_;
+ TLI = TLI_;
+ LookupAliasAnalysis = std::move(LookupAliasAnalysis_);
+ LookupAssumptionCache = std::move(LookupAssumptionCache_);
+ LookupDomTree = std::move(LookupDomTree_);
+
+  // If we don't have at least memset and memcpy, there is little point in doing
+  // anything here. These are required by a freestanding implementation, so if
+  // even they are disabled, there is no point in trying hard.
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
+ return false;
+
+ while (true) {
+ if (!iterateOnFunction(F))
+ break;
+ MadeChange = true;
+ }
+
+ MD = nullptr;
+ return MadeChange;
+}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
+ return getAnalysis<AAResultsWrapperPass>().getAAResults();
+ };
+ auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & {
+ return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ };
+ auto LookupDomTree = [this]() -> DominatorTree & {
+ return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ };
+
+ return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache,
+ LookupDomTree);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 000000000000..6727cf0179c1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -0,0 +1,431 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+//! diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. It hoists / sinks
+// when it thinks it is safe to do so. This optimization helps with, e.g.,
+// hiding load latencies, triggering if-conversion, and reducing static code
+// size.
+//
+// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+// header:
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// %lt = load %addr_l %le = load %addr_l
+// <use %lt> <use %le>
+// <...> <...>
+// store %st, %addr_s store %se, %addr_s
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// <...>
+//
+// Diamond shaped code after merge:
+//
+// header:
+// %l = load %addr_l
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// <use %l> <use %l>
+// <...> <...>
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// %s.sink = phi [%st, if.then], [%se, if.else]
+// <...>
+// store %s.sink, %addr_s
+// <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+class MergedLoadStoreMotion {
+ MemoryDependenceResults *MD = nullptr;
+ AliasAnalysis *AA = nullptr;
+
+ // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+ // where Size0 and Size1 are the #instructions on the two sides of
+ // the diamond. The constant chosen here is arbitrary. Compiler Time
+ // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+ const int MagicCompileTimeControl = 250;
+
+public:
+ bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA);
+
+private:
+ ///
+ /// \brief Remove instruction from parent and update memory dependence
+ /// analysis.
+ ///
+ void removeInstruction(Instruction *Inst);
+ BasicBlock *getDiamondTail(BasicBlock *BB);
+ bool isDiamondHead(BasicBlock *BB);
+ // Routines for sinking stores
+ StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+ PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+ bool isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End, MemoryLocation Loc);
+ bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+ bool mergeStores(BasicBlock *BB);
+};
+} // end anonymous namespace
+
+///
+/// \brief Remove instruction from parent and update memory dependence analysis.
+///
+void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
+ // Notify the memory dependence analysis.
+ if (MD) {
+ MD->removeInstruction(Inst);
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ MD->invalidateCachedPointerInfo(LI->getPointerOperand());
+ if (Inst->getType()->isPtrOrPtrVectorTy()) {
+ MD->invalidateCachedPointerInfo(Inst);
+ }
+ }
+ Inst->eraseFromParent();
+}
+
+///
+/// \brief Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+ assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+ return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
+}
+
+///
+/// \brief True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Succ0 = BI->getSuccessor(0);
+ BasicBlock *Succ1 = BI->getSuccessor(1);
+
+ if (!Succ0->getSinglePredecessor())
+ return false;
+ if (!Succ1->getSinglePredecessor())
+ return false;
+
+ BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
+ BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
+ // Ignore triangles.
+ if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
+ return false;
+ return true;
+}
+
+
+///
+/// \brief True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
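+/// For example (illustrative only):
+/// \code
+///   store i32 %v, i32* %p
+///   call void @observe(i32* %p)  ; may read *%p, so it is a sink barrier
+/// \endcode
+///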
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End,
+ MemoryLocation Loc) {
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (Inst.mayThrow())
+ return true;
+ return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);
+}
+
+///
+/// \brief Check if \p BB1 contains a store to the same address as \p Store0
+///
+/// \return The store in \p BB1 when it is safe to sink. Otherwise return null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ BasicBlock *BB0 = Store0->getParent();
+ for (Instruction &Inst : reverse(*BB1)) {
+ auto *Store1 = dyn_cast<StoreInst>(&Inst);
+ if (!Store1)
+ continue;
+
+ MemoryLocation Loc0 = MemoryLocation::get(Store0);
+ MemoryLocation Loc1 = MemoryLocation::get(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
+ return Store1;
+ }
+ }
+ return nullptr;
+}
+
+///
+/// \brief Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Create a phi if the values mismatch.
+ Value *Opd1 = S0->getValueOperand();
+ Value *Opd2 = S1->getValueOperand();
+ if (Opd1 == Opd2)
+ return nullptr;
+
+ auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+ &BB->front());
+ NewPN->addIncoming(Opd1, S0->getParent());
+ NewPN->addIncoming(Opd2, S1->getParent());
+ if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(NewPN);
+ return NewPN;
+}
+
+///
+/// \brief Merge two stores to the same address and sink them into \p BB
+///
+/// Also sinks the GEP instruction computing the store address
+///
+bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Only one definition?
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
+ DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+    // Find the insertion point in the join block.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->andIRFlags(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ // New PHI operand? Use it.
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
+ SNew->setOperand(0, NewPN);
+ removeInstruction(S0);
+ removeInstruction(S1);
+ A0->replaceAllUsesWith(ANew);
+ removeInstruction(A0);
+ A1->replaceAllUsesWith(ANew);
+ removeInstruction(A1);
+ return true;
+ }
+ return false;
+}
+
+///
+/// \brief True when equivalent stores were found and sunk into the footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+ bool MergedStores = false;
+ assert(T && "Footer of a diamond cannot be empty");
+
+ pred_iterator PI = pred_begin(T), E = pred_end(T);
+ assert(PI != E);
+ BasicBlock *Pred0 = *PI;
+ ++PI;
+ BasicBlock *Pred1 = *PI;
+ ++PI;
+ // tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ if (PI != E)
+ return false; // No. More than 2 predecessors.
+
+  // #Instructions in Pred1 for compile-time control
+ int Size1 = Pred1->size();
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+
+ // Don't sink non-simple (atomic, volatile) stores.
+ auto *S0 = dyn_cast<StoreInst>(I);
+ if (!S0 || !S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ bool Res = sinkStore(T, S0, S1);
+ MergedStores |= Res;
+      // Don't attempt to sink below stores that had to stick around.
+      // But after removal of a store and some of its feeding
+      // instructions, search again from the beginning since the iterator
+      // is likely stale at this point.
+ if (!Res)
+ break;
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ return MergedStores;
+}
+
+bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
+ AliasAnalysis &AA) {
+ this->MD = MD;
+ this->AA = &AA;
+
+ bool Changed = false;
+ DEBUG(dbgs() << "Instruction Merger\n");
+
+  // Walk the function, looking for diamonds whose stores can be merged and
+  // sunk into the footer.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+
+    // Sink equivalent stores outside diamonds when possible (load hoisting is
+    // now handled by GVNHoist).
+ if (isDiamondHead(BB)) {
+ Changed |= mergeStores(getDiamondTail(BB));
+ }
+ }
+ return Changed;
+}
+
+namespace {
+class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) {
+ initializeMergedLoadStoreMotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// \brief Run the transformation for each function
+ ///
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ MergedLoadStoreMotion Impl;
+ auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
+ return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr,
+ getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MergedLoadStoreMotionLegacyPass::ID = 0;
+} // anonymous namespace
+
+///
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+ return new MergedLoadStoreMotionLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+
+PreservedAnalyses
+MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
+ MergedLoadStoreMotion Impl;
+ auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ if (!Impl.run(F, MD, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
new file mode 100644
index 000000000000..d0bfe3603897
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -0,0 +1,509 @@
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+// void foo(int a, int b) {
+// bar(a + b);
+// bar((a + 2) + b);
+// }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+// int t = a + b;
+// bar(t);
+// bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in the depth-first order. For example, given
+//
+// (a + c) + d
+// ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+// ac = a + c
+// ab = a + b
+// abc = ab + c
+// ab2 = ab + b
+// ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+class NaryReassociateLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ NaryReassociateLegacyPass() : FunctionPass(ID) {
+ initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ NaryReassociatePass Impl;
+};
+} // anonymous namespace
+
+char NaryReassociateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+
+FunctionPass *llvm::createNaryReassociatePass() {
+ return new NaryReassociateLegacyPass();
+}
+
+bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+}
+
+PreservedAnalyses NaryReassociatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
+
+bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
+ DominatorTree *DT_, ScalarEvolution *SE_,
+ TargetLibraryInfo *TLI_,
+ TargetTransformInfo *TTI_) {
+ AC = AC_;
+ DT = DT_;
+ SE = SE_;
+ TLI = TLI_;
+ TTI = TTI_;
+ DL = &F.getParent()->getDataLayout();
+
+ bool Changed = false, ChangedInThisIteration;
+ do {
+ ChangedInThisIteration = doOneIteration(F);
+ Changed |= ChangedInThisIteration;
+ } while (ChangedInThisIteration);
+ return Changed;
+}
+
+// Whitelist the instruction types NaryReassociate handles for now.
+static bool isPotentiallyNaryReassociable(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::GetElementPtr:
+ case Instruction::Mul:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool NaryReassociatePass::doOneIteration(Function &F) {
+ bool Changed = false;
+ SeenExprs.clear();
+ // Process the basic blocks in a depth first traversal of the dominator
+ // tree. This order ensures that all bases of a candidate are in Candidates
+ // when we process it.
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ++I) {
+ if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
+ const SCEV *OldSCEV = SE->getSCEV(&*I);
+ if (Instruction *NewI = tryReassociate(&*I)) {
+ Changed = true;
+ SE->forgetValue(&*I);
+ I->replaceAllUsesWith(NewI);
+        // If SeenExprs contains I's WeakTrackingVH, that entry will be
+        // replaced with nullptr.
+ RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ I = NewI->getIterator();
+ }
+ // Add the rewritten instruction to SeenExprs; the original instruction
+ // is deleted.
+ const SCEV *NewSCEV = SE->getSCEV(&*I);
+ SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I));
+ // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+ // is equivalent to I. However, ScalarEvolution::getSCEV may
+ // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+ // we reassociate
+ // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+ // to
+ // NewI = &a[sext(i)] + sext(j).
+ //
+ // ScalarEvolution computes
+ // getSCEV(I) = a + 4 * sext(i + j)
+ // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+ // which are different SCEVs.
+ //
+ // To alleviate this issue of ScalarEvolution not always capturing
+ // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+ // map both SCEV before and after tryReassociate(I) to I.
+ //
+ // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+ if (NewSCEV != OldSCEV)
+ SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I));
+ }
+ }
+ }
+ return Changed;
+}
+
+Instruction *NaryReassociatePass::tryReassociate(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+ case Instruction::GetElementPtr:
+ return tryReassociateGEP(cast<GetElementPtrInst>(I));
+ default:
+ llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable");
+ }
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
+ // Not worth reassociating GEP if it is foldable.
+ if (isGEPFoldable(GEP, TTI))
+ return nullptr;
+
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
+ GTI.getIndexedType())) {
+ return NewGEP;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool NaryReassociatePass::requiresSignExtension(Value *Index,
+ GetElementPtrInst *GEP) {
+ unsigned PointerSizeInBits =
+ DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+ return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Type *IndexedType) {
+ Value *IndexToSplit = GEP->getOperand(I + 1);
+ if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+ IndexToSplit = SExt->getOperand(0);
+ } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+ // zext can be treated as sext if the source is non-negative.
+ if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+ IndexToSplit = ZExt->getOperand(0);
+ }
+
+ if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+ // If the I-th index needs sext and the underlying add is not equipped with
+ // nsw, we cannot split the add because
+ // sext(LHS + RHS) != sext(LHS) + sext(RHS).
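+    // For example (illustrative numbers, not from the original comment): with
+    // i8 operands LHS = RHS = 100, LHS + RHS wraps to -56, so
+    // sext(LHS + RHS) yields -56 while sext(LHS) + sext(RHS) yields 200.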
+ if (requiresSignExtension(IndexToSplit, GEP) &&
+ computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+ OverflowResult::NeverOverflows)
+ return nullptr;
+
+ Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+ // IndexToSplit = LHS + RHS.
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ // Symmetrically, try IndexToSplit = RHS + LHS.
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ }
+ return nullptr;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Value *LHS,
+ Value *RHS, Type *IndexedType) {
+ // Look for GEP's closest dominator that has the same SCEV as GEP except that
+ // the I-th index is replaced with LHS.
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+ IndexExprs.push_back(SE->getSCEV(*Index));
+ // Replace the I-th index with LHS.
+ IndexExprs[I] = SE->getSCEV(LHS);
+ if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+ DL->getTypeSizeInBits(LHS->getType()) <
+ DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {
+    // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+    // zext if the source operand is proved non-negative. We should do that
+    // consistently so that CandidateExpr is more likely to have been seen
+    // before. See @reassociate_gep_assume for an example of this
+    // canonicalization.
+ IndexExprs[I] =
+ SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+ }
+ const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
+ IndexExprs);
+
+ Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+ if (Candidate == nullptr)
+ return nullptr;
+
+ IRBuilder<> Builder(GEP);
+ // Candidate does not necessarily have the same pointer type as GEP. Use
+ // bitcast or pointer cast to make sure they have the same type, so that the
+ // later RAUW doesn't complain.
+ Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+ assert(Candidate->getType() == GEP->getType());
+
+ // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+ uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+ Type *ElementType = GEP->getResultElementType();
+ uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+  // Note that because I is not necessarily the last index of the GEP, the
+  // size of the type at the I-th index (IndexedSize) is not necessarily
+  // divisible by ElementSize. For example,
+ //
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+ //
+ // TODO: bail out on this case for now. We could emit uglygep.
+ if (IndexedSize % ElementSize != 0)
+ return nullptr;
+
+  // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))]
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (RHS->getType() != IntPtrTy)
+ RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+ if (IndexedSize != ElementSize) {
+ RHS = Builder.CreateMul(
+ RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+ }
+ GetElementPtrInst *NewGEP =
+ cast<GetElementPtrInst>(Builder.CreateGEP(Candidate, RHS));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
+ return NewGEP;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
+ return NewI;
+ if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
+ return NewI;
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+ BinaryOperator *I) {
+ Value *A = nullptr, *B = nullptr;
+  // To be conservative, we reassociate I only when it is the only user of
+  // (A op B).
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
+ // I = (A op B) op RHS
+ // = (A op RHS) op B or (B op RHS) op A
+ const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+ const SCEV *RHSExpr = SE->getSCEV(RHS);
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ }
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+ Value *RHS,
+ BinaryOperator *I) {
+ // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+ // I with LHS op RHS.
+ auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+ if (LHS == nullptr)
+ return nullptr;
+
+ Instruction *NewI = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+ break;
+ case Instruction::Mul:
+ NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ NewI->takeName(I);
+ return NewI;
+}
+
+bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
+ Value *&Op1, Value *&Op2) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+ case Instruction::Mul:
+ return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return false;
+}
+
+const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
+ const SCEV *LHS,
+ const SCEV *RHS) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return SE->getAddExpr(LHS, RHS);
+ case Instruction::Mul:
+ return SE->getMulExpr(LHS, RHS);
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return nullptr;
+}
+
+Instruction *
+NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
+ Instruction *Dominatee) {
+ auto Pos = SeenExprs.find(CandidateExpr);
+ if (Pos == SeenExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
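+  // (This holds because, in a pre-order walk, the blocks dominated by a
+  // candidate form a contiguous range starting at the candidate; once we have
+  // walked past that range, no later instruction can be dominated by it.)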
+ while (!Candidates.empty()) {
+    // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+    // removed during rewriting.
+ if (Value *Candidate = Candidates.back()) {
+ Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+ if (DT->dominates(CandidateInstruction, Dominatee))
+ return CandidateInstruction;
+ }
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
new file mode 100644
index 000000000000..9d018563618e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -0,0 +1,3941 @@
+//===---- NewGVN.cpp - Global Value Numbering Pass --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements LLVM's new Global Value Numbering pass.
+/// GVN partitions values computed by a function into congruence classes.
+/// Values ending up in the same congruence class are guaranteed to be the same
+/// for every execution of the program. In that respect, congruency is a
+/// compile-time approximation of equivalence of values at runtime.
+/// The algorithm implemented here uses a sparse formulation and is based
+/// on the ideas described in the paper:
+/// "A Sparse Algorithm for Predicated Global Value Numbering" from
+/// Karthik Gargi.
+///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
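+///
+/// As an illustrative IR sketch (the value and block names below are invented
+/// for this comment), the op-of-phis form
+///   %p = phi i32 [ %a, %bb1 ], [ %c, %bb2 ]
+///   %q = phi i32 [ %b, %bb1 ], [ %d, %bb2 ]
+///   %s = add i32 %p, %q
+/// is recognized as equivalent to the phi-of-ops form
+///   %t = phi i32 [ %ab, %bb1 ], [ %cd, %bb2 ]
+/// assuming %ab = add i32 %a, %b in %bb1 and %cd = add i32 %c, %d in %bb2.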
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::GVNExpression;
+using namespace llvm::VNCoercion;
+#define DEBUG_TYPE "newgvn"
+
+STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
+STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
+STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
+STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
+STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
+STATISTIC(NumGVNAvoidedSortedLeaderChanges,
+ "Number of avoided sorted leader changes");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered")
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for")
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+// Anchor methods.
+namespace llvm {
+namespace GVNExpression {
+Expression::~Expression() = default;
+BasicExpression::~BasicExpression() = default;
+CallExpression::~CallExpression() = default;
+LoadExpression::~LoadExpression() = default;
+StoreExpression::~StoreExpression() = default;
+AggregateValueExpression::~AggregateValueExpression() = default;
+PHIExpression::~PHIExpression() = default;
+}
+}
+
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
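+// A usage sketch of this helper (illustrative only; the instruction pointer I
+// is hypothetical):
+//   TarjanSCC SCCFinder;
+//   SCCFinder.Start(I);                             // explore I's use-def SCC
+//   const auto &SCC = SCCFinder.getComponentFor(I); // members of that SCC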
+struct TarjanSCC {
+
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+  // Store the components as a vector of ptr sets, because we need the topo
+  // order of SCC's, but not individual member order.
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
+// Congruence classes represent the set of expressions/instructions
+// that are all the same *during some scope in the function*.
+// That is, because of the way we perform equality propagation, and
+// because of memory value numbering, it is not correct to assume
+// you can willy-nilly replace any member with any other at any
+// point in the function.
+//
+// For any Value in the Member set, it is valid to replace any dominated member
+// with that Value.
+//
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with the leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if the class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at
+// least one of the memorydefs to a new memory congruence class, which in turn
+// affects the value numbering of the stores we just value numbered (because
+// the memory congruence class is part of the value number). So while it is
+// theoretically possible to split them up, it turns out to be *incredibly*
+// complicated to get it to work right, because of the interdependency. While
+// structurally slightly messier, it is algorithmically much simpler and faster
+// to do what we do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+ unsigned getID() const { return ID; }
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+  // Return true if two congruence classes are equivalent to each other. This
+  // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+ // We need some ordered set
+ std::set<Value *> AMembers(Members.begin(), Members.end());
+    std::set<Value *> BMembers(Other->Members.begin(), Other->Members.end());
+ return AMembers == BMembers;
+ }
+
+private:
+ unsigned ID;
+ // Representative leader.
+ Value *RepLeader = nullptr;
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
+ // Defining Expression.
+ const Expression *DefiningExpr = nullptr;
+ // Actual members of this class.
+ MemberSet Members;
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
+ // Number of stores in this congruence class.
+ // This is used so we can detect store equivalence changes properly.
+ int StoreCount = 0;
+};
+
+namespace llvm {
+struct ExactEqualsExpression {
+ const Expression &E;
+ explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+ hash_code getComputedHash() const { return E.getComputedHash(); }
+ bool operator==(const Expression &Other) const {
+ return E.exactlyEquals(Other);
+ }
+};
+
+template <> struct DenseMapInfo<const Expression *> {
+ static const Expression *getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+ static const Expression *getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(~1U);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
+ }
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+ if (RHS == getTombstoneKey() || RHS == getEmptyKey())
+ return false;
+ return LHS == *RHS;
+ }
+
+ static bool isEqual(const Expression *LHS, const Expression *RHS) {
+ if (LHS == RHS)
+ return true;
+ if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || RHS == getEmptyKey())
+ return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
+ return *LHS == *RHS;
+ }
+};
+} // end namespace llvm
+
+namespace {
+class NewGVN {
+ Function &F;
+ DominatorTree *DT;
+ const TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ MemorySSA *MSSA;
+ MemorySSAWalker *MSSAWalker;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
+
+ // Congruence class info.
+
+ // This class is called INITIAL in the paper. It is the class everything
+  // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
+ std::vector<CongruenceClass *> CongruenceClasses;
+ unsigned NextCongruenceNum;
+
+ // Value Mappings.
+ DenseMap<Value *, CongruenceClass *> ValueToClass;
+ DenseMap<Value *, const Expression *> ValueToExpression;
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ DenseSet<const Instruction *> PHINodeUses;
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+ // Map between the temporary phis we created and the real instructions they
+ // are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+ // Map from basic block to the temporary operations we created
+ DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs;
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+ // Set of all temporary instructions we created.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+  // Mapping kept for the same reasoning as PredicateToUsers: when we skip
+  // MemoryAccesses for stores, we can no longer rely solely on the def-use
+  // chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
+
+ // A table storing which memorydefs/phis represent a memory state provably
+ // equivalent to another memory state.
+ // We could use the congruence class machinery, but the MemoryAccess's are
+ // abstract memory states, so they can only ever be equivalent to each other,
+ // and not to constants, etc.
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+  // when they change class, but when they change state. Note: We can do the
+  // same thing for phis, and avoid having phi expressions if we wanted. We
+  // should eventually unify in one direction or the other, so this is a little
+  // bit of an experiment in seeing which turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
+ // Expression to class mapping.
+ using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
+ ExpressionClassMap ExpressionToClass;
+
+  // We keep a single expression that currently represents all DeadExpressions.
+  // For dead expressions that we can prove will stay dead, we mark them with
+ // DFS number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
+ // Which values have changed as a result of leader changes.
+ SmallPtrSet<Value *, 8> LeaderChanges;
+
+ // Reachability info.
+ using BlockEdge = BasicBlockEdge;
+ DenseSet<BlockEdge> ReachableEdges;
+ SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
+
+ // This is a bitvector because, on larger functions, we may have
+ // thousands of touched instructions at once (entire blocks,
+ // instructions with hundreds of uses, etc). Even with optimization
+ // for when we mark whole blocks as touched, when this was a
+ // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
+ // the time in GVN just managing this list. The bitvector, on the
+  // other hand, efficiently supports test/set/clear of both
+  // individual bits and ranges, as well as "find next element". This
+  // enables us to use it as a worklist with essentially 0 cost.
+ BitVector TouchedInstructions;
+
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+#ifndef NDEBUG
+ // Debugging for how many times each block and instruction got processed.
+ DenseMap<const Value *, unsigned> ProcessedCount;
+#endif
+
+ // DFS info.
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
+ DenseMap<const Value *, unsigned> InstrDFS;
+
+  // This contains the mapping from DFS numbers to instructions.
+ SmallVector<Value *, 32> DFSToInstr;
+
+ // Deletion info.
+ SmallPtrSet<Instruction *, 8> InstructionsToErase;
+
+public:
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) {
+ }
+ bool runGVN();
+
+private:
+ // Expression handling.
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
+ LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
+
+ // Congruence class handling.
+ CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
+ auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
+ CongruenceClasses.emplace_back(result);
+ return result;
+ }
+
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
+ CongruenceClass *createSingletonCongruenceClass(Value *Member) {
+ CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
+ CClass->insert(Member);
+ ValueToClass[Member] = CClass;
+ return CClass;
+ }
+ void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePhiOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
+
+ // Value number an Instruction or MemoryPhi.
+ void valueNumberMemoryPhi(MemoryPhi *);
+ void valueNumberInstruction(Instruction *);
+
+ // Symbolic evaluation.
+ const Expression *checkSimplificationResults(Expression *, Instruction *,
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
+
+ // Congruence finding.
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
+ void performCongruenceFinding(Instruction *, const Expression *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
+ // Reachability handling.
+ void updateReachableEdge(BasicBlock *, BasicBlock *);
+ void processOutgoingEdges(TerminatorInst *, BasicBlock *);
+ Value *findConditionEquivalence(Value *) const;
+
+ // Elimination.
+ struct ValueDFS;
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
+
+ bool eliminateInstructions(Function &);
+ void replaceInstruction(Instruction *, Value *);
+ void markInstructionForDeletion(Instruction *);
+ void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const;
+
+ // New instruction creation.
+ void handleNewInstruction(Instruction *){};
+
+ // Various instruction touch utilities
+ template <typename Map, typename KeyType, typename Func>
+ void for_each_found(Map &, const KeyType &, Func);
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
+ void markUsersTouched(Value *);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
+
+ // Utilities.
+ void cleanupTables();
+ std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ void updateProcessedCount(const Value *V);
+ void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
+
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
+
+template <typename T>
+static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
+ return false;
+ return LHS.MemoryExpression::equals(RHS);
+}
+
+bool LoadExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+bool StoreExpression::equals(const Expression &Other) const {
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge
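+// (In this numbering, an edge counts as a backedge if it is a self edge or if
+// From does not come strictly before To in the RPOOrdering computed over the
+// dominator tree nodes.)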
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ if (From == To)
+ return true;
+ auto *FromDTN = DT->getNode(From);
+ auto *ToDTN = DT->getNode(To);
+ return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN);
+}
+
+#ifndef NDEBUG
+static std::string getBlockName(const BasicBlock *B) {
+ return DOTGraphTraits<const Function *>::getSimpleNodeLabel(B, nullptr);
+}
+#endif
+
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
+
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) const {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ BasicBlock *PHIBlock = getBlockForValue(I);
+ auto *PN = cast<PHINode>(I);
+ auto *E =
+ new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
+
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+  // this in LLVM at some point, we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+  // We're sorting the values by pointer. In theory this might be a cause of
+ // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ if (*U == PN)
+ return false;
+ if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(*U) == TOPClass)
+ return false;
+ return lookupOperandLeader(*U) != PN;
+ });
+ std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ OriginalOpsConstant =
+ OriginalOpsConstant && isa<Constant>(*U);
+ return lookupOperandLeader(*U);
+ });
+ return E;
+}
+
+// Set basic expression info (Arguments, type, opcode) for Expression
+// E from Instruction I in block B.
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
+ bool AllConstant = true;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ E->setType(GEP->getSourceElementType());
+ else
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+
+ // Transform the operand array into an operand leader array, and keep track of
+ // whether all members are constant.
+ std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
+ return Operand;
+ });
+
+ return AllConstant;
+}
+
+const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
+ Value *Arg1,
+ Value *Arg2) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(2);
+
+ E->setType(T);
+ E->setOpcode(Opcode);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ if (Instruction::isCommutative(Opcode)) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ if (shouldSwapOperands(Arg1, Arg2))
+ std::swap(Arg1, Arg2);
+ }
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
+
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
+ return SimplifiedE;
+ return E;
+}
+
+// Take a Value returned by simplification of Expression E/Instruction
+// I, and see if it resulted in a simpler expression. If so, return
+// that expression.
+// TODO: Once finished, this should not take an Instruction, we only
+// use it for printing.
+const Expression *NewGVN::checkSimplificationResults(Expression *E,
+ Instruction *I,
+ Value *V) const {
+ if (!V)
+ return nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+ deleteExpression(E);
+ return createConstantExpression(C);
+ } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
+ deleteExpression(E);
+ return createVariableExpression(V);
+ }
+
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC && CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
+ if (I)
+ DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
+ NumGVNOpsSimplified++;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
+ }
+ return nullptr;
+}
+
+const Expression *NewGVN::createExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
+
+ bool AllConstant = setBasicExpressionInfo(I, E);
+
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
+ E->swapOperands(0, 1);
+ }
+
+  // Perform simplification.
+ // TODO: Right now we only check to see if we get a constant result.
+ // We may get a less than constant, but still better, result for
+ // some operations.
+ // IE
+ // add 0, x -> x
+ // and x, x -> x
+ // We should handle this by simply rewriting the expression.
+ if (auto *CI = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value
+ // number.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
+ E->swapOperands(0, 1);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ E->setOpcode((CI->getOpcode() << 8) | Predicate);
+ // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
+ assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
+ "Wrong types on cmp instruction");
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<SelectInst>(I)) {
+ if (isa<Constant>(E->getOperand(0)) ||
+ E->getOperand(0) == E->getOperand(1)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
+ Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
+ E->getOperand(2), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (I->isBinaryOp()) {
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (auto *BI = dyn_cast<BitCastInst>(I)) {
+ Value *V =
+ SimplifyCastInst(BI->getOpcode(), BI->getOperand(0), BI->getType(), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (AllConstant) {
+ // We don't bother trying to simplify unless all of the operands
+ // were constant.
+ // TODO: There are a lot of Simplify*'s we could call here, if we
+ // wanted to. The original motivating case for this code was a
+ // zext i1 false to i8, which we don't have an interface to
+ // simplify (IE there is no SimplifyZExt).
+
+ SmallVector<Constant *, 8> C;
+ for (Value *Arg : E->operands())
+ C.emplace_back(cast<Constant>(Arg));
+
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ return E;
+}
+
+const AggregateValueExpression *
+NewGVN::createAggregateValueExpression(Instruction *I) const {
+ if (auto *II = dyn_cast<InsertValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
+ setBasicExpressionInfo(I, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
+ return E;
+ } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
+ setBasicExpressionInfo(EI, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
+ return E;
+ }
+ llvm_unreachable("Unhandled type of aggregate value operation");
+}
+
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
+ auto *E = new (ExpressionAllocator) VariableExpression(V);
+ E->setOpcode(V->getValueID());
+ return E;
+}
+
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
+ return createConstantExpression(C);
+ return createVariableExpression(V);
+}
+
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
+ auto *E = new (ExpressionAllocator) ConstantExpression(C);
+ E->setOpcode(C->getValueID());
+ return E;
+}
+
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) UnknownExpression(I);
+ E->setOpcode(I->getOpcode());
+ return E;
+}
+
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
+ // FIXME: Add operand bundles for calls.
+ auto *E =
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
+ return E;
+}
+
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+  // The most likely candidates for dominance are the leader and the next leader.
+  // The leader or next leader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
+// See if we have a congruence class and leader for this operand, and if so,
+// return it. Otherwise, return the operand itself.
+Value *NewGVN::lookupOperandLeader(Value *V) const {
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
+ return V;
+}
+
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
+}
+
+LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(LoadType);
+
+  // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(PointerOp);
+ if (LI)
+ E->setAlignment(LI->getAlignment());
+
+ // TODO: Value number heap versions. We may be able to discover
+  // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(SI->getValueOperand()->getType());
+
+  // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
+
+ // TODO: Value number heap versions. We may be able to discover
+  // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
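+// Symbolically evaluate a store. For example, given
+//   %v = load i32, i32* %p
+//   store i32 %v, i32* %p
+// with no intervening clobber, the store just writes back what is already in
+// memory, so it is valued the same as the prior memory state and can later be
+// eliminated as redundant.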
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
+ // Unlike loads, we never try to eliminate stores, so we do not check if they
+ // are simple and avoid value numbering them.
+ auto *SI = cast<StoreInst>(I);
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
+ if (SI->isSimple()) {
+    // See if we are defined by a previous store expression that already has a
+    // value, and that value is the same as our current store's value. FIXME:
+    // Right now we only do this for simple stores; we should expand to cover
+    // memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. No
+ // easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because we must also be equivalent to that store's expression
+ // for it to be in the same class as the load.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later. See test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+  // produces a unique memory state (instead of using its MemoryUse, we use
+  // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
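+// For example, given
+//   store i64 0, i64* %p
+//   %q = bitcast i64* %p to i32*
+//   %v = load i32, i32* %q
+// the load is fully covered by the constant store, so %v can be valued as the
+// constant 0 even though the types differ.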
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+    // Also, we don't need to coerce if they are the same type; we will just
+    // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+
+ } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+
+ } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading for a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+  // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
+ auto *LI = cast<LoadInst>(I);
+
+ // We can eliminate in favor of non-simple loads, but we won't be able to
+ // eliminate the loads themselves.
+ if (!LI->isSimple())
+ return nullptr;
+
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
+ // Load of undef is undef.
+ if (isa<UndefValue>(LoadAddressLeader))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
+
+ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
+ Instruction *DefiningInst = MD->getMemoryInst();
+ // If the defining instruction is not reachable, replace with undef.
+ if (!ReachableBlocks.count(DefiningInst->getParent()))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+      // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
+ }
+ }
+
+ const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+ LI, DefiningAccess);
+ return E;
+}
+
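+// Evaluate the ssa.copy intrinsics inserted by PredicateInfo. For example, in
+//   %cmp = icmp eq i32 %a, %b
+//   br i1 %cmp, label %t, label %f
+// true branch:
+//   %a.0 = ssa.copy(%a)
+// the copy %a.0 is only reached when %a == %b, so along the true edge it is
+// valued as the common representative of %a and %b.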
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+  // If this is a copy of the condition, it must be either true or false
+  // depending on the predicate info type and edge.
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+    // If we are *not* a copy of the comparison, we may be equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
+
+// Evaluate read only and pure calls, and create an expression result.
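+// For example, two calls to the same readnone function with congruent
+// arguments get equal call expressions and value number together; calls that
+// only read memory also include their defining MemoryAccess, so they are only
+// merged when no intervening store may have changed the memory they read.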
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
+ auto *CI = cast<CallInst>(I);
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
+ MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
+ return createCallExpression(CI, DefiningAccess);
+ }
+ return nullptr;
+}
+
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+  auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From belongs to
+// NewClass, and return true if this is different from what already existed in
+// the table.
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
+ bool Changed = false;
+ // If it's already in the table, see if the value changed.
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
+ // It wasn't equivalent before, and now it is.
+ LookupResult->second = NewClass;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
+    // It's cycle free if its size is 1 or the SCC is *only* phi nodes.
+ if (SCC.size() == 1)
+ InstCycleState.insert({I, ICS_CycleFree});
+ else {
+ bool AllPhis =
+ llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ InstCycleState.insert({MemberPhi, ICS});
+ }
+ }
+ if (ICS == ICS_Cycle)
+ return false;
+ return true;
+}
+
+// Evaluate PHI nodes symbolically, and create an expression result.
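+// For example,
+//   %p = phi i32 [ %x, %bb1 ], [ %x, %bb2 ], [ undef, %bb3 ]
+// has a single non-undef incoming value, so, subject to the dominance and
+// cycle-freeness checks below, it is valued the same as %x.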
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+  // AllConstant tracks whether all of the *original* phi operands were
+  // constant. This is really shorthand for "this phi cannot cycle due to
+  // forward propagation": any change in value of the phi is guaranteed not to
+  // later change the value of the phi. IE it can't be v = phi(undef, v+1).
+ bool AllConstant = true;
+ auto *E =
+ cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's dead.
+ if (Filtered.begin() == Filtered.end()) {
+    // If it has undef at this point, it means there are no non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ DEBUG(dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
+ }
+ unsigned NumOps = 0;
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [&](Value *Arg) {
+ ++NumOps;
+ return Arg == AllSameValue;
+ })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+    // on the original phi node), especially in weird CFGs where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, or all operands are ignored but the undef, it also must be
+ // cycle free.
+ if (!AllConstant && HasBackedge && NumOps > 0 &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!someEquivalentDominates(AllSameInst, I))
+ return E;
+ }
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
+ NumGVNPhisAllSame++;
+ DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
+ }
+ return E;
+}
+
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
+ if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+ if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ unsigned Opcode = 0;
+ // EI might be an extract from one of our recognised intrinsics. If it
+      // is, we'll synthesize a semantically equivalent expression instead of
+ // an extract value expression.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ Opcode = Instruction::Add;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ Opcode = Instruction::Sub;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ Opcode = Instruction::Mul;
+ break;
+ default:
+ break;
+ }
+
+ if (Opcode != 0) {
+ // Intrinsic recognized. Grab its args to finish building the
+ // expression.
+ assert(II->getNumArgOperands() == 2 &&
+ "Expect two args for recognised intrinsics.");
+ return createBinaryExpression(
+ Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
+ }
+ }
+ }
+
+ return createAggregateValueExpression(I);
+}
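+
+// Symbolically evaluate a comparison. For example, if we have already branched
+// on `%c = icmp slt i32 %a, %b` and are on the true edge, then a later
+// `icmp sge i32 %a, %b` of the same operands is implied false, and another
+// `icmp slt i32 %a, %b` is implied true.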
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+ auto *CI = dyn_cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+  // %c and %a may start out equal, and thus, the code below will say the
+  // second icmp is false. %c may become equal to something else, and in that
+  // case the second icmp *must* be reexamined, but it would not be if only the
+  // renamed operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+  // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
+ // we know something just from the operands themselves
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+
+ // TODO: Along the false edge, we may know more things too, like icmp of
+ // same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are in the true
+ // edge then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+            // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+  // createExpression will take care of SimplifyCmpInst.
+ return createExpression(I);
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+// TODO: Separate cost from availability
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Substitute and symbolize the value before value numbering.
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
+ const Expression *E = nullptr;
+ if (auto *C = dyn_cast<Constant>(V))
+ E = createConstantExpression(C);
+ else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ E = createVariableExpression(V);
+ } else {
+ // TODO: memory intrinsics.
+ // TODO: Some day, we should do the forward propagation and reassociation
+ // parts of the algorithm.
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ E = performSymbolicAggrValueEvaluation(I);
+ break;
+ case Instruction::PHI:
+ E = performSymbolicPHIEvaluation(I);
+ break;
+ case Instruction::Call:
+ E = performSymbolicCallEvaluation(I);
+ break;
+ case Instruction::Store:
+ E = performSymbolicStoreEvaluation(I);
+ break;
+ case Instruction::Load:
+ E = performSymbolicLoadEvaluation(I);
+ break;
+ case Instruction::BitCast: {
+ E = createExpression(I);
+ } break;
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ E = performSymbolicCmpEvaluation(I);
+ } break;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::GetElementPtr:
+ E = createExpression(I);
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ return E;
+}
+
+// Look up a container in a map, and then call a function for each thing in the
+// found container.
+template <typename Map, typename KeyType, typename Func>
+void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end())
+ for (typename Map::mapped_type::value_type Mapped : Result->second)
+ F(Mapped);
+}
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the entry from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
+
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ if (isa<Instruction>(To))
+ AdditionalUsers[To].insert(User);
+}
+
+void NewGVN::markUsersTouched(Value *V) {
+ // Now mark the users as touched.
+ for (auto *User : V->users()) {
+ assert(isa<Instruction>(User) && "Use of value not within an instruction?");
+ TouchedInstructions.set(InstrToDFSNum(User));
+ }
+ touchAndErase(AdditionalUsers, V);
+}
+
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+ DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ touchAndErase(MemoryToUsers, MA);
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+  else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrToDFSNum(I));
+ LeaderChanges.insert(M);
+ }
+}
+
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+  // TODO: If this ends up too slow, we can maintain a next memory leader like
+  // we do for regular leaders.
+  // Make sure there will be a leader to find.
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+  // Given our assertion, hitting this part must mean
+  // !CC->memory_empty().
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+    // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+//   for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+  // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
+ }
+  // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
+ ValueToClass[I] = NewClass;
+ // See if we destroyed the class or need to swap leaders.
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
+ }
+ } else if (OldClass->getLeader() == I) {
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
+ ++NumGVNLeaderChanges;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
+ }
+}
+
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+}
+
+// Perform congruence finding on a given value numbering expression.
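+// For example, if `%x = add i32 %a, %b` and `%y = add i32 %a, %b` evaluate to
+// the same expression, the one processed second joins the congruence class of
+// the first, and uses of it can later be replaced by the class leader when the
+// leader dominates them.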
+void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
+ // This is guaranteed to return something, since it will at least find
+ // TOP.
+
+ CongruenceClass *IClass = ValueToClass.lookup(I);
+ assert(IClass && "Should have found a IClass");
+ // Dead classes should have been eliminated from the mapping.
+ assert(!IClass->isDead() && "Found a dead class");
+
+ CongruenceClass *EClass = nullptr;
+ if (const auto *VE = dyn_cast<VariableExpression>(E)) {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
+ auto lookupResult = ExpressionToClass.insert({E, nullptr});
+
+ // If it's not in the value table, create a new congruence class.
+ if (lookupResult.second) {
+ CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
+ auto place = lookupResult.first;
+ place->second = NewClass;
+
+ // Constants and variables should always be made the leader.
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ NewClass->setLeader(CE->getConstantValue());
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
+ } else {
+ NewClass->setLeader(I);
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
+
+ EClass = NewClass;
+ DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
+ } else {
+ EClass = lookupResult.first->second;
+ if (isa<ConstantExpression>(E))
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
+ "Any class with a constant expression should have a "
+ "constant leader");
+
+ assert(EClass && "Somehow don't have an eclass");
+
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
+ }
+ }
+ bool ClassChanged = IClass != EClass;
+ bool LeaderChanged = LeaderChanges.erase(I);
+ if (ClassChanged || LeaderChanged) {
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
+ << "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
+
+ markUsersTouched(I);
+ if (MemoryAccess *MA = getMemoryAccess(I))
+ markMemoryUsersTouched(MA);
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against stored
+ // value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+ }
+ }
+ ValueToExpression[I] = E;
+}
+
+// Process the fact that Edge (from, to) is reachable, including marking
+// any newly reachable blocks and instructions for processing.
+void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
+ // Check if the Edge was reachable before.
+ if (ReachableEdges.insert({From, To}).second) {
+ // If this block wasn't reachable before, all instructions are touched.
+ if (ReachableBlocks.insert(To).second) {
+ DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n");
+ const auto &InstRange = BlockInstRange.lookup(To);
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ } else {
+ DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {" << getBlockName(From)
+ << "," << getBlockName(To) << "} to it found\n");
+
+ // We've made an edge reachable to an existing block, which may
+ // impact predicates. Otherwise, only mark the phi nodes as touched, as
+      // they are the only things that depend on new edges. Anything using their
+ // values will get propagated to if necessary.
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
+
+ auto BI = To->begin();
+ while (isa<PHINode>(BI)) {
+ TouchedInstructions.set(InstrToDFSNum(&*BI));
+ ++BI;
+ }
+ for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
+ TouchedInstructions.set(InstrToDFSNum(I));
+ });
+ }
+ }
+}
+
+// Given a predicate condition (from a switch, cmp, or whatever), see if we
+// know some constant value for it already.
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
+}
+
+// Process the outgoing edges of a block for reachability.
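+// For example, when the condition of a conditional branch is congruent to the
+// constant true, only the edge to the true successor is marked reachable; a
+// block reachable only through the false edge is never processed and its
+// values stay in TOP.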
+void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
+ // Evaluate reachability of terminator instruction.
+ BranchInst *BR;
+ if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
+ Value *Cond = BR->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(Cond);
+ if (!CondEvaluated) {
+ if (auto *I = dyn_cast<Instruction>(Cond)) {
+ const Expression *E = createExpression(I);
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ CondEvaluated = CE->getConstantValue();
+ }
+ } else if (isa<ConstantInt>(Cond)) {
+ CondEvaluated = Cond;
+ }
+ }
+ ConstantInt *CI;
+ BasicBlock *TrueSucc = BR->getSuccessor(0);
+ BasicBlock *FalseSucc = BR->getSuccessor(1);
+ if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
+ if (CI->isOne()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
+ updateReachableEdge(B, TrueSucc);
+ } else if (CI->isZero()) {
+ DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else {
+ updateReachableEdge(B, TrueSucc);
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ // For switches, propagate the case values into the case
+ // destinations.
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+
+ Value *SwitchCond = SI->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
+ // See if we were able to turn this switch statement into a constant.
+ if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
+ auto *CondVal = cast<ConstantInt>(CondEvaluated);
+ // We should be able to get case value for this.
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
+ // We proved the value is outside of the range of the case.
+ // We can't do anything other than mark the default dest as reachable,
+ // and go home.
+ updateReachableEdge(B, SI->getDefaultDest());
+ return;
+ }
+ // Now get where it goes and mark it reachable.
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
+ updateReachableEdge(B, TargetBlock);
+ } else {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ updateReachableEdge(B, TargetBlock);
+ }
+ }
+ } else {
+ // Otherwise this is either unconditional, or a type we have no
+ // idea about. Just mark successors as reachable.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = TI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+
+ // This also may be a memory defining terminator, in which case, set it
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
+ }
+}
+
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ PHIOfOpsPHIs[BB].push_back(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
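+// For example,
+//   %p = phi i32 [ 2, %bb1 ], [ 3, %bb2 ]
+//   %a = add i32 %p, 1
+// can be expressed as a phi of the translated operations:
+//   %a.phi = phi i32 [ 3, %bb1 ], [ 4, %bb2 ]
+// provided each translated operation already has a leader (here, the constants
+// 3 and 4) in the corresponding predecessor.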
+const Expression *
+NewGVN::makePossiblePhiOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+  // of ops; we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ unsigned IDFSNum = InstrToDFSNum(I);
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+  // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ for (auto &Op : I->operands()) {
+ // TODO: We can't handle expressions that must be recursively translated
+ // IE
+ // a = phi (b, c)
+ // f = use a
+ // g = f + phi of something
+ // To properly make a phi of ops for g, we'd have to properly translate and
+ // use the instruction for f. We should add this by splitting out the
+ // instruction creation we do below.
+ if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op)))
+ return nullptr;
+ if (!isa<PHINode>(Op))
+ continue;
+ auto *OpPHI = cast<PHINode>(Op);
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1)
+ continue;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ for (auto PredBB : OpPHI->blocks()) {
+ Value *FoundVal = nullptr;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it, and see if we
+ // have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+
+ for (auto &Op : ValueOp->operands()) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ // When this operand changes, it could change whether there is a
+ // leader for us or not.
+ addAdditionalUsers(Op, I);
+ }
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(ValueOp);
+          // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ InstrDFS.insert({ValueOp, IDFSNum});
+ const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
+ InstrDFS.erase(ValueOp);
+ AllTempInstructions.erase(ValueOp);
+ ValueOp->deleteValue();
+ if (MemAccess)
+ TempToMemory.erase(ValueOp);
+ if (!E)
+ return nullptr;
+ FoundVal = findPhiOfOpsLeader(E, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(I);
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ } else {
+ DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ }
+
+ Ops.push_back({FoundVal, PredBB});
+ DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : Ops)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ unsigned int i = 0;
+ for (auto PHIOp : Ops) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+
+ DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+ return performSymbolicEvaluation(ValuePHI, Visited);
+ }
+ return nullptr;
+}
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
+void NewGVN::initializeCongruenceClasses(Function &F) {
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access reaches all the way back to the beginning of the function.
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
+ }
+ for (auto &I : *BB) {
+ // TODO: Move to helper
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
+ }
+ }
+
+ // Initialize arguments to be in their own unique congruence classes
+ for (auto &FA : F.args())
+ createSingletonCongruenceClass(&FA);
+}
+
+void NewGVN::cleanupTables() {
+ for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
+ // Make sure we delete the congruence class (probably worth switching to
+ // a unique_ptr at some point).
+ delete CongruenceClasses[i];
+ CongruenceClasses[i] = nullptr;
+ }
+
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
+ auto *I = TempInst.back();
+ TempInst.pop_back();
+ I->deleteValue();
+ }
+
+ ValueToClass.clear();
+ ArgRecycler.clear(ExpressionAllocator);
+ ExpressionAllocator.Reset();
+ CongruenceClasses.clear();
+ ExpressionToClass.clear();
+ ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHIOfOpsPHIs.clear();
+ ReachableBlocks.clear();
+ ReachableEdges.clear();
+#ifndef NDEBUG
+ ProcessedCount.clear();
+#endif
+ InstrDFS.clear();
+ InstructionsToErase.clear();
+ DFSToInstr.clear();
+ BlockInstRange.clear();
+ TouchedInstructions.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
+}
+
+// Assign local DFS number mapping to instructions, and leave space for Value
+ // PHIs.
+std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
+ unsigned Start) {
+ unsigned End = Start;
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
+ InstrDFS[MemPhi] = End++;
+ DFSToInstr.emplace_back(MemPhi);
+ }
+
+ // Then the real block goes next.
+ for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+ InstrDFS[&I] = End++;
+ DFSToInstr.emplace_back(&I);
+ }
+
+ // All of the range functions take half-open ranges (open on the end side).
+ // So we do not subtract one from count, because at this point it is one
+ // greater than the last instruction.
+ return std::make_pair(Start, End);
+}
+
+void NewGVN::updateProcessedCount(const Value *V) {
+#ifndef NDEBUG
+ if (ProcessedCount.count(V) == 0) {
+ ProcessedCount.insert({V, 1});
+ } else {
+ ++ProcessedCount[V];
+ assert(ProcessedCount[V] < 100 &&
+ "Seem to have processed the same Value a lot");
+ }
+#endif
+}
+// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
+void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
+ // If all the arguments are the same, the MemoryPhi has the same value as the
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
+ auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
+ });
+ // If nothing is left after filtering, our MemoryPhi is undef. We keep it in
+ // TOPClass. Note: the only case this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
+
+ // Transform the remaining operands into operand leaders.
+ // FIXME: mapped_iterator should have a range version.
+ auto LookupFunc = [&](const Use &U) {
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
+ };
+ auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
+ auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
+
+ // And now check if all the elements are equal.
+ // Sadly, we can't simply use std::equal here, so we check by hand that every
+ // mapped value matches the first one.
+ const auto *AllSameValue = *MappedBegin;
+ ++MappedBegin;
+ bool AllEqual = std::all_of(
+ MappedBegin, MappedEnd,
+ [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
+
+ if (AllEqual)
+ DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
+ else
+ DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
+ markMemoryUsersTouched(MP);
+}
+
+// Value number a single instruction, symbolically evaluating, performing
+// congruence finding, and updating mappings.
+void NewGVN::valueNumberInstruction(Instruction *I) {
+ DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ if (!I->isTerminator()) {
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePhiOfOps(I, Visited);
+ if (PHIE)
+ Symbolized = PHIE;
+ }
+
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
+ // If we couldn't come up with a symbolic expression, use the unknown
+ // expression
+ if (Symbolized == nullptr)
+ Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ } else {
+ // Handle terminators that return values. All of them produce values we
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
+ if (!I->getType()->isVoidTy()) {
+ auto *Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ }
+ processOutgoingEdges(dyn_cast<TerminatorInst>(I), I->getParent());
+ }
+}
+
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
+ if (First == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(First))
+ return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be that of doing strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccess and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
+ }
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
+}
+
+ // Verify that the memory equivalence table makes sense relative to the
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
+ // congruence class.
+
+ // Filter out the unreachable and trivially dead entries, because they may
+ // never have been updated if the instructions were not processed.
+ auto ReachableAccessPred =
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
+ bool Result = ReachableBlocks.count(Pair.first->getBlock());
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
+ return false;
+ if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
+ return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ return true;
+ };
+
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
+ for (auto KV : Filtered) {
+ if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
+ } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
+ // We can only sanely verify that MemoryDefs in the operand list all have
+ // the same class.
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
+ isa<MemoryDef>(U);
+
+ };
+ // All arguments should be in the same class, ignoring unreachable arguments.
+ auto FilteredPhiArgs =
+ make_filter_range(FirstMP->operands(), ReachableOperandPred);
+ SmallVector<const CongruenceClass *, 16> PhiOpClasses;
+ std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(PhiOpClasses), [&](const Use &U) {
+ const MemoryDef *MD = cast<MemoryDef>(U);
+ return ValueToClass.lookup(MD->getMemoryInst());
+ });
+ assert(std::equal(PhiOpClasses.begin(), PhiOpClasses.end(),
+ PhiOpClasses.begin()) &&
+ "All MemoryPhi arguments should be in the same class");
+ }
+ }
+#endif
+}
+
+ // Verify that the sparse propagation we did actually found the maximal fixpoint.
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
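+ // DFS number 0 is reserved for dead/unused instructions, so never touch it.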
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the pairs
+ // of classes that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+ // Verify that for each store expression in the expression-to-class mapping,
+ // only the latest one appears.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // DenseMapInfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+ // This is the main value numbering loop. It iterates over the initial touched
+ // instruction set, propagating value numbers and marking things touched, etc.,
+ // until the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
+}
+
+// This is the main transformation entry point.
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
+ bool Changed = false;
+ NumFuncArgs = F.arg_size();
+ MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
+
+ // Count number of instructions for sizing of hash tables, and come
+ // up with a global dfs numbering for instructions.
+ unsigned ICount = 1;
+ // Add an empty instruction to account for the fact that we start at 1
+ DFSToInstr.emplace_back(nullptr);
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
+ // If we visit in the wrong order, we will end up performing N times as many
+ // iterations.
+ // The dominator tree does guarantee that, for a given dom tree node, its
+ // parent must occur before it in the RPO ordering. Thus, we only need to sort
+ // the siblings.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ unsigned Counter = 0;
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ assert(Node && "RPO and Dominator tree should have same reachability");
+ RPOOrdering[Node] = ++Counter;
+ }
+ // Sort dominator tree children arrays into RPO.
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ if (Node->getChildren().size() > 1)
+ std::sort(Node->begin(), Node->end(),
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
+ return RPOOrdering[A] < RPOOrdering[B];
+ });
+ }
+
+ // Now a standard depth first ordering of the domtree is equivalent to RPO.
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
+ const auto &BlockRange = assignDFSNumbers(B, ICount);
+ BlockInstRange.insert({B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+ initializeCongruenceClasses(F);
+
+ TouchedInstructions.resize(ICount);
+ // Ensure we don't end up resizing the expressionToClass map, as
+ // that can be quite expensive. At most, we have one expression per
+ // instruction.
+ ExpressionToClass.reserve(ICount);
+
+ // Initialize the touched instructions to include the entry block.
+ const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
+ ReachableBlocks.insert(&F.getEntryBlock());
+
+ iterateTouchedInstructions();
+ verifyMemoryCongruency();
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
+ Changed |= eliminateInstructions(F);
+
+ // Delete all instructions marked for deletion.
+ for (Instruction *ToErase : InstructionsToErase) {
+ if (!ToErase->use_empty())
+ ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+
+ if (ToErase->getParent())
+ ToErase->eraseFromParent();
+ }
+
+ // Delete all unreachable blocks.
+ auto UnreachableBlockPred = [&](const BasicBlock &BB) {
+ return !ReachableBlocks.count(&BB);
+ };
+
+ for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
+ DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
+ deleteInstructionsInBlock(&BB);
+ Changed = true;
+ }
+
+ cleanupTables();
+ return Changed;
+}
+
+struct NewGVN::ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ int LocalNum = 0;
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
+ Use *U = nullptr;
+ bool operator<(const ValueDFS &Other) const {
+ // It's not enough that any given field be less than - we have sets
+ // of fields that need to be evaluated together to give a proper ordering.
+ // For example, if you have:
+ // DFS (1, 3)
+ // Val 0
+ // DFS (1, 2)
+ // Val 50
+ // We want the second to be less than the first, but if we just go field
+ // by field, we will get to Val 0 < Val 50 and say the first is less than
+ // the second. We only want it to be less than if the DFS orders are equal.
+ //
+ // Each LLVM instruction only produces one value, and thus the lowest-level
+ // differentiator that really matters for the stack (and what we use as a
+ // replacement) is the local dfs number.
+ // Everything else in the structure is instruction level, and only affects
+ // the order in which we will replace operands of a given instruction.
+ //
+ // For a given instruction (IE things with equal dfsin, dfsout, localnum),
+ // the order of replacement of uses does not matter.
+ // IE given,
+ // a = 5
+ // b = a + a
+ // When you hit b, you will have two valuedfs with the same dfsin, out, and
+ // localnum.
+ // The .val will be the same as well.
+ // The .u's will be different.
+ // You will replace both, and it does not matter what order you replace them
+ // in (IE whether you replace operand 2, then operand 1, or operand 1, then
+ // operand 2).
+ // Similarly for the case of same dfsin, dfsout, localnum, but different
+ // .val's
+ // a = 5
+ // b = 6
+ // c = a + b
+ // in c, we will have a valuedfs for a, and one for b, with everything the
+ // same but .val and .u.
+ // It does not matter what order we replace these operands in.
+ // You will always end up with the same IR, and this is guaranteed.
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
+ Other.U);
+ }
+};
+
+ // This function converts the set of members for a congruence class from values
+ // to sets of defs and uses with associated DFS info. The total number of
+ // reachable uses for each value is stored in UseCounts, and instructions that
+ // seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
+ for (auto D : Dense) {
+ // First add the value.
+ BasicBlock *BB = getBlockForValue(D);
+ // Constants are handled prior to ever calling this function, so
+ // we should only be left with instructions as members.
+ assert(BB && "Should have figured out a basic block for value");
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
+
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *P = dyn_cast<PHINode>(I)) {
+ IBlock = P->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VDUse.LocalNum = InstrDFS.size() + 1;
+ } else {
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
+ }
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
+ }
+ }
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so it is not definitely dead). Otherwise, store the number of uses so we
+ // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
+ }
+}
+
+ // This function converts the set of members for a congruence class from values
+ // to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
+
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
+
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
+
+ LoadsAndStores.emplace_back(VD);
+ }
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ ++NumGVNBlocksDeleted;
+
+ // Delete the instructions backwards, as doing so reduces the likelihood of
+ // having to update as many def-use and use-def chains. Start after the
+ // terminator.
+ auto StartPoint = BB->rbegin();
+ ++StartPoint;
+ // Note that we explicitly recalculate BB->rend() on each iteration,
+ // as it may change when we remove the first instruction.
+ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
+ Instruction &Inst = *I++;
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ if (isa<LandingPadInst>(Inst))
+ continue;
+
+ Inst.eraseFromParent();
+ ++NumGVNInstrDeleted;
+ }
+ // Now insert something that simplifycfg will turn into an unreachable.
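+ // (A store of undef through a null pointer is one pattern later passes
+ // recognize as undefined behavior and fold into an unreachable.)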
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
+}
+
+void NewGVN::markInstructionForDeletion(Instruction *I) {
+ DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ InstructionsToErase.insert(I);
+}
+
+void NewGVN::replaceInstruction(Instruction *I, Value *V) {
+
+ DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ patchAndReplaceAllUsesWith(I, V);
+ // We save the actual erasing to avoid invalidating memory
+ // dependencies until we are done with everything.
+ markInstructionForDeletion(I);
+}
+
+namespace {
+
+// This is a stack that contains both the value and dfs info of where
+// that value is valid.
+class ValueDFSStack {
+public:
+ Value *back() const { return ValueStack.back(); }
+ std::pair<int, int> dfs_back() const { return DFSStack.back(); }
+
+ void push_back(Value *V, int DFSIn, int DFSOut) {
+ ValueStack.emplace_back(V);
+ DFSStack.emplace_back(DFSIn, DFSOut);
+ }
+ bool empty() const { return DFSStack.empty(); }
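+ // A scope is the dominator-tree DFS interval of the block a value was pushed
+ // in; (DFSIn, DFSOut) nests inside the top of the stack exactly when the
+ // pushed value's block dominates the queried position.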
+ bool isInScope(int DFSIn, int DFSOut) const {
+ if (empty())
+ return false;
+ return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
+ }
+
+ void popUntilDFSScope(int DFSIn, int DFSOut) {
+
+ // These two should always be in sync at this point.
+ assert(ValueStack.size() == DFSStack.size() &&
+ "Mismatch between ValueStack and DFSStack");
+ while (
+ !DFSStack.empty() &&
+ !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
+ DFSStack.pop_back();
+ ValueStack.pop_back();
+ }
+ }
+
+private:
+ SmallVector<Value *, 8> ValueStack;
+ SmallVector<std::pair<int, int>, 8> DFSStack;
+};
+}
+
+ // Given an expression and the basic block we are trying to make it available
+ // in, see if the expression has a leader available in that block.
+Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return VE->getVariableValue();
+
+ auto *CC = ExpressionToClass.lookup(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ // If we are looking for something in the same block as the member, it must
+ // be a leader because this function is looking for operands for a phi node.
+ if (MemberInst->getParent() == BB ||
+ DT->dominates(MemberInst->getParent(), BB)) {
+ return Member;
+ }
+ }
+ return nullptr;
+}
+
+bool NewGVN::eliminateInstructions(Function &F) {
+ // This is a non-standard eliminator. The normal way to eliminate is
+ // to walk the dominator tree in order, keeping track of available
+ // values, and eliminating them. However, this is mildly
+ // pointless. It requires doing lookups on every instruction,
+ // regardless of whether we will ever eliminate it. For
+ // instructions part of most singleton congruence classes, we know we
+ // will never eliminate them.
+
+ // Instead, this eliminator looks at the congruence classes directly, sorts
+ // them into a DFS ordering of the dominator tree, and then we just
+ // perform elimination straight on the sets by walking the congruence
+ // class member uses in order, and eliminate the ones dominated by the
+ // last member. This is worst case O(E log E) where E = number of
+ // instructions in a single congruence class. In theory, this is all
+ // instructions. In practice, it is much faster, as most instructions are
+ // either in singleton congruence classes or can't possibly be eliminated
+ // anyway (if there are no overlapping DFS ranges in class).
+ // When we find something not dominated, it becomes the new leader
+ // for elimination purposes.
+ // TODO: If we wanted to be faster, we could remove any members with no
+ // overlapping ranges while sorting, as we will never eliminate anything
+ // with those members, as they don't dominate anything else in our set.
+
+ bool AnythingReplaced = false;
+
+ // Since we are going to walk the domtree anyway, and we can't guarantee the
+ // DFS numbers are updated, we compute some ourselves.
+ DT->updateDFSNumbers();
+
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI.incoming_values())
+ if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) {
+ DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
+ << getBlockName(PHI.getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI.getType()));
+ }
+ };
+ SmallPtrSet<BasicBlock *, 8> BlocksWithPhis;
+ for (auto &B : F)
+ if ((!B.empty() && isa<PHINode>(*B.begin())) ||
+ (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end()))
+ BlocksWithPhis.insert(&B);
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto *BB : BlocksWithPhis)
+ // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
+ // the block and subtract the pred count, but it's more complicated.
+ if (ReachablePredCount.lookup(BB) !=
+ unsigned(std::distance(pred_begin(BB), pred_end(BB)))) {
+ for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
+ auto &PHI = cast<PHINode>(*II);
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) {
+ ReplaceUnreachablePHIArgs(*PHI, BB);
+ });
+ }
+
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
+ continue;
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+
+ assert(CC->getLeader() && "We should have had a leader");
+ // If this is a leader that is always available, and it's a
+ // constant or has no equivalences, just replace everything with
+ // it. We then update the congruence class with whatever members
+ // are left.
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
+ Value *Member = M;
+ // Void things have no uses we can replace.
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
+ }
+ CC->swap(MembersLeft);
+ } else {
+ // If this is a singleton with no phi-of-ops equivalent, we can skip it.
+ if (CC->size() != 1 || RealToTemp.lookup(Leader)) {
+ // This is a stack because equality replacement/etc may place
+ // constants in the middle of the member list, and we want to use
+ // those constant values in preference to the current leader, over
+ // the scope of those constants.
+ ValueDFSStack EliminationStack;
+
+ // Convert the members to DFS ordered sets and then merge them.
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
+
+ // Sort the whole thing.
+ std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+ // If this is a value phi and that's the expression we used, insert
+ // it into the program and remove it from the temp instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
+ }
+
+ if (EliminationStack.empty()) {
+ DEBUG(dbgs() << "Elimination Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
+ // First, we see if we are out of scope or empty. If so,
+ // and there are equivalences, we try to replace the top of the
+ // stack with equivalences (if it's on the stack, it must
+ // not have been eliminated yet).
+ // Then we synchronize to our current scope, by
+ // popping until we are back within a DFS scope that
+ // dominates the current member.
+ // Then, what happens depends on a few factors
+ // If the stack is now empty, we need to push
+ // If we have a constant or a local equivalence we want to
+ // start using, we also push.
+ // Otherwise, we walk along, processing members who are
+ // dominated by this scope, and eliminate them.
+ bool ShouldPush = Def && EliminationStack.empty();
+ bool OutOfScope =
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
+
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ bool ShouldPush = Def && EliminationStack.empty();
+ if (ShouldPush) {
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Def's, we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (i.e.
+ // throwing, etc.) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
+ }
+ }
+
+ // If we get to this point, and the stack is empty we must have a use
+ // with nothing we can use to eliminate this use, so just skip it.
+ if (EliminationStack.empty())
+ continue;
+
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ DominatingLeader = II->getOperand(0);
+
+ // Don't replace our existing users with ourselves.
+ if (U->get() == DominatingLeader)
+ continue;
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
+
+ // If we replaced something in an instruction, handle the patching of
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ if (LeaderUseCount == 0 && II)
+ ProbablyDead.insert(II);
+ ++LeaderUseCount;
+ AnythingReplaced = true;
+ }
+ }
+ }
+
+ // At this point, anything still in the ProbablyDead set is actually dead if
+ // it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
+ // Cleanup the congruence class.
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
+ MembersLeft.insert(Member);
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
+ }
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead.
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
+ }
+ }
+ }
+ return AnythingReplaced;
+}
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
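+ // Roughly: plain constants rank lowest, then undef, then constant
+ // expressions, then arguments in position order, then instructions in DFS
+ // order, with anything unreachable ranked last.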
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Prefer constants to undef to anything else.
+ // Undef is a constant, so we have to check for it first.
+ // Prefer plain constants to constant expressions.
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS numbers past the constant and argument
+ // ranks assigned above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+ // This function says whether the two operands of a commutative operation
+ // should be swapped when canonicalizing.
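+ // The net effect is that both operand orders of a commutative operation are
+ // canonicalized the same way, so, for example, "a + b" and "b + a" produce
+ // the same expression and land in the same congruence class.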
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+namespace {
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+char NewGVNLegacyPass::ID = 0;
+
+// createGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 000000000000..1bfecea2f61e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,176 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+
+static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB) {
+ // There is no need to change the IR, since the backend will emit a sqrt
+ // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ // The call must have the expected result type.
+ if (!Call->getType()->isFloatingPointTy())
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // if (v0 is a NaN)
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+ // Add attribute "readnone" so that backend can use a native sqrt instruction
+ // for this call. Insert a FP compare instruction and a conditional branch
+ // at the end of CurrBB.
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
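+ // An ordered equality compare of a value with itself is true unless the
+ // value is a NaN, so the slow library-call path is taken only when the fast
+ // sqrt produced a NaN (e.g. a negative input that needs errno set).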
+ Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ BB = JoinBB->getIterator();
+ return true;
+}
+
+static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ bool Changed = false;
+
+ Function::iterator CurrBB;
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+      // Skip if the function either has local linkage or is not a known
+      // library function.
+ LibFunc LF;
+ if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+ !TLI->getLibFunc(CalledFunc->getName(), LF))
+ continue;
+
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
+ initializePartiallyInlineLibCallsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runPartiallyInlineLibCalls(F, TLI, TTI);
+ }
+};
+}
+
+char PartiallyInlineLibCallsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCallsLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 000000000000..e47b636348e3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,680 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in a gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
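+//
+// For illustration only (the body is GC and runtime specific, and the global
+// @__poll_flag and helper @__rt_safepoint below are hypothetical), a
+// gc.safepoint_poll implementation might look roughly like:
+//
+//   define void @gc.safepoint_poll() {
+//   entry:
+//     %flag = load volatile i32, i32* @__poll_flag
+//     %do.poll = icmp ne i32 %flag, 0
+//     br i1 %do.poll, label %slow, label %done
+//   slow:
+//     call void @__rt_safepoint()   ; slow path; becomes the parse point
+//     br label %done
+//   done:
+//     ret void
+//   }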
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop,
+ "Number of loops without safepoints due to calls in loop");
+STATISTIC(FiniteExecution,
+          "Number of loops without safepoints due to finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// How narrow does the trip count of a loop have to be to be considered
+/// "counted"? Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+ cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are worth supporting for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+ static char ID;
+
+ /// The output of the pass - gives a list of each backedge (described by
+ /// pointing at the branch) which need a poll inserted.
+ std::vector<TerminatorInst *> PollLocations;
+
+  /// True unless we're running spp-no-call in which case we need to disable
+ /// the call-dependent placement opts.
+ bool CallSafepointsEnabled;
+
+ ScalarEvolution *SE = nullptr;
+ DominatorTree *DT = nullptr;
+ LoopInfo *LI = nullptr;
+
+ PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+ : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+ initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *);
+ void runOnLoopAndSubLoops(Loop *L) {
+ // Visit all the subloops
+ for (Loop *I : *L)
+ runOnLoopAndSubLoops(I);
+ runOnLoop(L);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ for (Loop *I : *LI) {
+ runOnLoopAndSubLoops(I);
+ }
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ // We no longer modify the IR at all in this pass. Thus all
+    // analyses are preserved.
+ AU.setPreservesAll();
+ }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PlaceSafepoints() : FunctionPass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call; that's the
+// caller's job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+static bool needsStatepoint(const CallSite &CS) {
+ if (callsGCLeafFunction(CS))
+ return false;
+ if (CS.isCall()) {
+ CallInst *call = cast<CallInst>(CS.getInstruction());
+ if (call->isInlineAsm())
+ return false;
+ }
+
+ return !(isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS));
+}
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservatively correct
+/// answer; i.e., returning false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+ BasicBlock *Pred,
+ DominatorTree &DT) {
+ // In general, we're looking for any cut of the graph which ensures
+ // there's a call safepoint along every edge between Header and Pred.
+ // For the moment, we look only for the 'cuts' that consist of a single call
+ // instruction in a block which is dominated by the Header and dominates the
+ // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+ // of such dominating blocks gets substantially more occurrences than just
+ // checking the Pred and Header blocks themselves. This may be due to the
+ // density of loop exit conditions caused by range and null checks.
+ // TODO: structure this as an analysis pass, cache the result for subloops,
+ // avoid dom tree recalculations
+ assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+ BasicBlock *Current = Pred;
+ while (true) {
+ for (Instruction &I : *Current) {
+ if (auto CS = CallSite(&I))
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here. We should instead be checking if the target method
+        // has an unconditional poll. In practice, this is only a theoretical
+        // concern since we don't have any methods with conditional-only
+        // safepoint polls.
+ if (needsStatepoint(CS))
+ return true;
+ }
+
+ if (Current == Header)
+ break;
+ Current = DT.getNode(Current)->getIDom()->getBlock();
+ }
+
+ return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actually terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+ BasicBlock *Pred) {
+ // A conservative bound on the loop as a whole.
+ const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+ if (MaxTrips != SE->getCouldNotCompute() &&
+ SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+
+ // If this is a conditional branch to the header with the alternate path
+ // being outside the loop, we can ask questions about the execution frequency
+ // of the exit block.
+ if (L->isLoopExiting(Pred)) {
+ // This returns an exact expression only. TODO: We really only need an
+ // upper bound here, but SE doesn't expose that.
+ const SCEV *MaxExec = SE->getExitCount(L, Pred);
+ if (MaxExec != SE->getCouldNotCompute() &&
+ SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+ }
+
+ return /* not finite */ false;
+}
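+
+// Illustrative note (not from the original source): with the default
+// spp-counted-loop-trip-width of 32, a loop whose maximum backedge-taken
+// count provably fits in 32 bits (e.g. a simple "for (uint32_t i = 0;
+// i != n; ++i)" loop) is treated as counted and gets no backedge poll.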
+
+static void scanOneBB(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen,
+ std::vector<BasicBlock *> &Worklist) {
+ for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
+ BBE1 = BasicBlock::iterator(End);
+ BBI != BBE0 && BBI != BBE1; BBI++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
+ Calls.push_back(CI);
+
+ // FIXME: This code does not handle invokes
+ assert(!isa<InvokeInst>(&*BBI) &&
+ "support for invokes in poll code needed");
+
+ // Only add the successor blocks if we reach the terminator instruction
+ // without encountering end first
+ if (BBI->isTerminator()) {
+ BasicBlock *BB = BBI->getParent();
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Seen.insert(Succ).second) {
+ Worklist.push_back(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void scanInlinedCode(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen) {
+ Calls.clear();
+ std::vector<BasicBlock *> Worklist;
+ Seen.insert(Start->getParent());
+ scanOneBB(Start, End, Calls, Seen, Worklist);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
+ }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+ // Loop through all loop latches (branches controlling backedges). We need
+ // to place a safepoint on every backedge (potentially).
+ // Note: In common usage, there will be only one edge due to LoopSimplify
+ // having run sometime earlier in the pipeline, but this code must be correct
+ // w.r.t. loops with multiple backedges.
+ BasicBlock *Header = L->getHeader();
+ SmallVector<BasicBlock*, 16> LoopLatches;
+ L->getLoopLatches(LoopLatches);
+ for (BasicBlock *Pred : LoopLatches) {
+ assert(L->contains(Pred));
+
+ // Make a policy decision about whether this loop needs a safepoint or
+ // not. Note that this is about unburdening the optimizer in loops, not
+ // avoiding the runtime cost of the actual safepoint.
+ if (!AllBackedges) {
+ if (mustBeFiniteCountedLoop(L, SE, Pred)) {
+ DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ FiniteExecution++;
+ continue;
+ }
+ if (CallSafepointsEnabled &&
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion. Otherwise, we
+        // might later lose this call safepoint.
+ DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n");
+ CallInLoop++;
+ continue;
+ }
+ }
+
+ // TODO: We can create an inner loop which runs a finite number of
+ // iterations with an outer loop which contains a safepoint. This would
+ // not help runtime performance that much, but it might help our ability to
+ // optimize the inner loop.
+
+ // Safepoint insertion would involve creating a new basic block (as the
+ // target of the current backedge) which does the safepoint (of all live
+ // variables) and branches to the true header
+ TerminatorInst *Term = Pred->getTerminator();
+
+ DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+
+ PollLocations.push_back(Term);
+ }
+
+ return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
+ Instruction *Inst = CS.getInstruction();
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+      // These can wrap an actual call which may grow the stack by an unbounded
+ // amount or run forever.
+ return false;
+ default:
+ // Most LLVM intrinsics are things which do not expand to actual calls, or
+ // at least if they do, are leaf functions that cause only finite stack
+ // growth. In particular, the optimizer likes to form things like memsets
+ // out of stores in the original IR. Another important example is
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
+ // out of the entry block.
+ return true;
+ }
+ }
+ return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+ DominatorTree &DT) {
+
+ // Conceptually, this poll needs to be on method entry, but in
+ // practice, we place it as late in the entry block as possible. We
+ // can place it as late as we want as long as it dominates all calls
+ // that can grow the stack. This, combined with backedge polls,
+  // gives us all the progress guarantees we need.
+
+  // HasNextInstruction and NextInstruction are used to iterate
+ // through a "straight line" execution sequence.
+
+ auto HasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator())
+ return true;
+
+ BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+ return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+ };
+
+ auto NextInstruction = [&](Instruction *I) {
+ assert(HasNextInstruction(I) &&
+ "first check if there is a next instruction!");
+
+ if (I->isTerminator())
+ return &I->getParent()->getUniqueSuccessor()->front();
+ return &*++I->getIterator();
+ };
+
+ Instruction *Cursor = nullptr;
+ for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
+ Cursor = NextInstruction(Cursor)) {
+
+ // We need to ensure a safepoint poll occurs before any 'real' call. The
+ // easiest way to ensure finite execution between safepoints in the face of
+ // recursive and mutually recursive functions is to enforce that each take
+ // a safepoint. Additionally, we need to ensure a poll before any call
+ // which can grow the stack by an unbounded amount. This isn't required
+ // for GC semantics per se, but is a common requirement for languages
+ // which detect stack overflow via guard pages and then throw exceptions.
+ if (auto CS = CallSite(Cursor)) {
+ if (doesNotRequireEntrySafepointBefore(CS))
+ continue;
+ break;
+ }
+ }
+
+ assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
+ "either we stopped because of a call, or because of terminator");
+
+ return Cursor;
+}
+
+static const char *const GCSafepointPollName = "gc.safepoint_poll";
+
+static bool isGCSafepointPoll(Function &F) {
+ return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+ if (F.isDeclaration() || F.empty()) {
+ // This is a declaration, nothing to do. Must exit early to avoid crash in
+ // dom tree calculation
+ return false;
+ }
+
+ if (isGCSafepointPoll(F)) {
+ // Given we're inlining this inside of safepoint poll insertion, this
+ // doesn't make any sense. Note that we do make any contained calls
+ // parseable after we inline a poll.
+ return false;
+ }
+
+ if (!shouldRewriteFunction(F))
+ return false;
+
+ bool Modified = false;
+
+ // In various bits below, we rely on the fact that uses are reachable from
+ // defs. When there are basic blocks unreachable from the entry, dominance
+  // and reachability queries return nonsensical results. Thus, we preprocess
+ // the function to ensure these properties hold.
+ Modified |= removeUnreachableBlocks(F);
+
+ // STEP 1 - Insert the safepoint polling locations. We do not need to
+ // actually insert parse points yet. That will be done for all polls and
+ // calls in a single pass.
+
+ DominatorTree DT;
+ DT.recalculate(F);
+
+ SmallVector<Instruction *, 16> PollsNeeded;
+ std::vector<CallSite> ParsePointNeeded;
+
+ if (enableBackedgeSafepoints(F)) {
+ // Construct a pass manager to run the LoopPass backedge logic. We
+ // need the pass manager to handle scheduling all the loop passes
+ // appropriately. Doing this by hand is painful and just not worth messing
+ // with for the moment.
+ legacy::FunctionPassManager FPM(F.getParent());
+ bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+ auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ FPM.add(PBS);
+ FPM.run(F);
+
+ // We preserve dominance information when inserting the poll, otherwise
+ // we'd have to recalculate this on every insert
+ DT.recalculate(F);
+
+ auto &PollLocations = PBS->PollLocations;
+
+ auto OrderByBBName = [](Instruction *a, Instruction *b) {
+ return a->getParent()->getName() < b->getParent()->getName();
+ };
+    // We need the order of the list to be stable so that naming ends up stable
+ // when we split edges. This makes test cases much easier to write.
+ std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+
+ // We can sometimes end up with duplicate poll locations. This happens if
+ // a single loop is visited more than once. The fact this happens seems
+ // wrong, but it does happen for the split-backedge.ll test case.
+ PollLocations.erase(std::unique(PollLocations.begin(),
+ PollLocations.end()),
+ PollLocations.end());
+
+ // Insert a poll at each point the analysis pass identified
+ // The poll location must be the terminator of a loop latch block.
+ for (TerminatorInst *Term : PollLocations) {
+ // We are inserting a poll, the function is modified
+ Modified = true;
+
+ if (SplitBackedge) {
+ // Split the backedge of the loop and insert the poll within that new
+ // basic block. This creates a loop with two latches per original
+ // latch (which is non-ideal), but this appears to be easier to
+ // optimize in practice than inserting the poll immediately before the
+ // latch test.
+
+ // Since this is a latch, at least one of the successors must dominate
+        // it. It's possible that we have a) duplicate edges to the same header
+        // and b) edges to distinct loop headers. We need to insert polls on
+        // each.
+ SetVector<BasicBlock *> Headers;
+ for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (DT.dominates(Succ, Term->getParent())) {
+ Headers.insert(Succ);
+ }
+ }
+ assert(!Headers.empty() && "poll location is not a loop latch?");
+
+ // The split loop structure here is so that we only need to recalculate
+ // the dominator tree once. Alternatively, we could just keep it up to
+ // date and use a more natural merged loop.
+ SetVector<BasicBlock *> SplitBackedges;
+ for (BasicBlock *Header : Headers) {
+ BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+ PollsNeeded.push_back(NewBB->getTerminator());
+ NumBackedgeSafepoints++;
+ }
+ } else {
+ // Split the latch block itself, right before the terminator.
+ PollsNeeded.push_back(Term);
+ NumBackedgeSafepoints++;
+ }
+ }
+ }
+
+ if (enableEntrySafepoints(F)) {
+ if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
+ PollsNeeded.push_back(Location);
+ Modified = true;
+ NumEntrySafepoints++;
+ }
+ // TODO: else we should assert that there was, in fact, a policy choice to
+    // not insert an entry safepoint poll.
+ }
+
+ // Now that we've identified all the needed safepoint poll locations, insert
+ // safepoint polls themselves.
+ for (Instruction *PollLocation : PollsNeeded) {
+ std::vector<CallSite> RuntimeCalls;
+ InsertSafepointPoll(PollLocation, RuntimeCalls);
+ ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+ RuntimeCalls.end());
+ }
+
+ return Modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+ return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Module *M = InsertBefore->getModule();
+ assert(M && "must be part of a module");
+
+  // Inline the safepoint poll implementation - this will get all the branches,
+  // control flow, etc. Most importantly, it will introduce the actual slow
+ // path call - where we need to insert a safepoint (parsepoint).
+
+ auto *F = M->getFunction(GCSafepointPollName);
+ assert(F && "gc.safepoint_poll function is missing");
+ assert(F->getValueType() ==
+ FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+ "gc.safepoint_poll declared with wrong type");
+ assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+ CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+ // Record some information about the call site we're replacing
+ BasicBlock::iterator Before(PollCall), After(PollCall);
+ bool IsBegin = false;
+ if (Before == OrigBB->begin())
+ IsBegin = true;
+ else
+ Before--;
+
+ After++;
+ assert(After != OrigBB->end() && "must have successor");
+
+ // Do the actual inlining
+ InlineFunctionInfo IFI;
+ bool InlineStatus = InlineFunction(PollCall, IFI);
+ assert(InlineStatus && "inline must succeed");
+ (void)InlineStatus; // suppress warning in release-asserts
+
+ // Check post-conditions
+ assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+ std::vector<CallInst *> Calls; // new calls
+ DenseSet<BasicBlock *> BBs; // new BBs + insertee
+
+  // Include only the newly inserted instructions. Note: Begin may not be valid
+  // if we inserted to the beginning of the basic block.
+ BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
+
+ // If your poll function includes an unreachable at the end, that's not
+ // valid. Bugpoint likes to create this, so check for it.
+ assert(isPotentiallyReachable(&*Start, &*After) &&
+ "malformed poll function");
+
+ scanInlinedCode(&*Start, &*After, Calls, BBs);
+ assert(!Calls.empty() && "slow path not found for safepoint poll");
+
+ // Record the fact we need a parsable state at the runtime call contained in
+ // the poll function. This is required so that the runtime knows how to
+ // parse the last frame when we actually take the safepoint (i.e. execute
+ // the slow path)
+ assert(ParsePointsNeeded.empty());
+ for (auto *CI : Calls) {
+ // No safepoint needed or wanted
+ if (!needsStatepoint(CI))
+ continue;
+
+ // These are likely runtime calls. Should we assert that via calling
+ // convention or something?
+ ParsePointsNeeded.push_back(CallSite(CI));
+ }
+ assert(ParsePointsNeeded.size() <= Calls.size());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 000000000000..29d1ba406ae4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,2281 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of the current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+using namespace reassociate;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr trees annihilated");
+STATISTIC(NumFactor, "Number of multiplies factored");
+
+#ifndef NDEBUG
+/// Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+ Module *M = I->getModule();
+ dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+ << *Ops[0].Op->getType() << '\t';
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ dbgs() << "[ ";
+ Ops[i].Op->printAsOperand(dbgs(), false, M);
+ dbgs() << ", #" << Ops[i].Rank << "] ";
+ }
+}
+#endif
+
+/// Utility class representing a non-constant Xor-operand. We classify
+/// non-constant Xor-Operands into two categories:
+/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+/// C2)
+/// C2.1) The operand is in the form of "X | C", where C is a non-zero
+/// constant.
+///   C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
+///         operand as "E | 0"
+class llvm::reassociate::XorOpnd {
+public:
+ XorOpnd(Value *V);
+
+ bool isInvalid() const { return SymbolicPart == nullptr; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+};
+
+XorOpnd::XorOpnd(Value *V) {
+ assert(!isa<ConstantInt>(V) && "No ConstantInt");
+ OrigVal = V;
+ Instruction *I = dyn_cast<Instruction>(V);
+ SymbolicRank = 0;
+
+ if (I && (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And)) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ const APInt *C;
+ if (match(V0, PatternMatch::m_APInt(C)))
+ std::swap(V0, V1);
+
+ if (match(V1, PatternMatch::m_APInt(C))) {
+ ConstPart = *C;
+ SymbolicPart = V0;
+ isOr = (I->getOpcode() == Instruction::Or);
+ return;
+ }
+ }
+
+ // view the operand as "V | 0"
+ SymbolicPart = V;
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
+ isOr = true;
+}
+
+/// Return V as a BinaryOperator if it is an instruction of the specified
+/// opcode and has only one use; otherwise return null.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ if (V->hasOneUse() && isa<Instruction>(V) &&
+ cast<Instruction>(V)->getOpcode() == Opcode &&
+ (!isa<FPMathOperator>(V) ||
+ cast<Instruction>(V)->hasUnsafeAlgebra()))
+ return cast<BinaryOperator>(V);
+ return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+ unsigned Opcode2) {
+ if (V->hasOneUse() && isa<Instruction>(V) &&
+ (cast<Instruction>(V)->getOpcode() == Opcode1 ||
+ cast<Instruction>(V)->getOpcode() == Opcode2) &&
+ (!isa<FPMathOperator>(V) ||
+ cast<Instruction>(V)->hasUnsafeAlgebra()))
+ return cast<BinaryOperator>(V);
+ return nullptr;
+}
+
+void ReassociatePass::BuildRankMap(Function &F,
+ ReversePostOrderTraversal<Function*> &RPOT) {
+ unsigned i = 2;
+
+ // Assign distinct ranks to function arguments.
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ ValueRankMap[&*I] = ++i;
+ DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
+ }
+
+ // Traverse basic blocks in ReversePostOrder
+ for (BasicBlock *BB : RPOT) {
+ unsigned BBRank = RankMap[BB] = ++i << 16;
+
+ // Walk the basic block, adding precomputed ranks for any instructions that
+ // we cannot move. This ensures that the ranks for these instructions are
+ // all different in the block.
+ for (Instruction &I : *BB)
+ if (mayBeMemoryDependent(I))
+ ValueRankMap[&I] = ++BBRank;
+ }
+}
+
+unsigned ReassociatePass::getRank(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
+ return 0; // Otherwise it's a global or constant, rank 0.
+ }
+
+ if (unsigned Rank = ValueRankMap[I])
+ return Rank; // Rank already known?
+
+ // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
+ // we can reassociate expressions for code motion! Since we do not recurse
+ // for PHI nodes, we cannot have infinite recursion here, because there
+ // cannot be loops in the value graph that do not go through PHI nodes.
+ unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+ for (unsigned i = 0, e = I->getNumOperands();
+ i != e && Rank != MaxRank; ++i)
+ Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+ // If this is a not or neg instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
+ ++Rank;
+
+ DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+
+ return ValueRankMap[I] = Rank;
+}
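+
+// Illustrative example (not from the original source): given
+//
+//   %t = add i32 %a, %b    ; %a, %b are function arguments
+//   %r = add i32 %t, 4
+//
+// the constant 4 has rank 0, the arguments get small distinct ranks, and %t
+// and %r each get 1 + max(rank of their operands), so expressions defined
+// deeper in the function (e.g. inside loops) rank higher. canonicalizeOperands
+// below uses these ranks to move constants and low-rank values to the RHS.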
+
+// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
+void ReassociatePass::canonicalizeOperands(Instruction *I) {
+ assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+ assert(I->isCommutative() && "Expected commutative operator.");
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ unsigned LHSRank = getRank(LHS);
+ unsigned RHSRank = getRank(RHS);
+
+ if (isa<Constant>(RHS))
+ return;
+
+ if (isa<Constant>(LHS) || RHSRank < LHSRank)
+ cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+ else {
+ BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+/// Replace 0-X with X*-1.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+ Type *Ty = Neg->getType();
+ Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
+
+ BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+ Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
+ Res->takeName(Neg);
+ Neg->replaceAllUsesWith(Res);
+ Res->setDebugLoc(Neg->getDebugLoc());
+ return Res;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+ if (Bitwidth < 3)
+ return Bitwidth - 1;
+ return Bitwidth - 2;
+}
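+
+// Worked example (for illustration): CarmichaelShift(32) returns 30, i.e.
+// lambda(2^32) = 2^30, so x^(2^30) == 1 (mod 2^32) for every odd 32-bit x.
+// For Bitwidth = 2 it returns 1: lambda(4) = 2, and indeed 1^2 == 3^2 == 1
+// (mod 4).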
+
+/// Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times. The combined weight represents X op X op ... op X with
+/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+ // If we were working with infinite precision arithmetic then the combined
+ // weight would be LHS + RHS. But we are using finite precision arithmetic,
+ // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+ // for nilpotent operations and addition, but not for idempotent operations
+ // and multiplication), so it is important to correctly reduce the combined
+ // weight back into range if wrapping would be wrong.
+
+ // If RHS is zero then the weight didn't change.
+ if (RHS.isMinValue())
+ return;
+ // If LHS is zero then the combined weight is RHS.
+ if (LHS.isMinValue()) {
+ LHS = RHS;
+ return;
+ }
+ // From this point on we know that neither LHS nor RHS is zero.
+
+ if (Instruction::isIdempotent(Opcode)) {
+ // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+ // weight of 1. Keeping weights at zero or one also means that wrapping is
+ // not a problem.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ return; // Return a weight of 1.
+ }
+ if (Instruction::isNilpotent(Opcode)) {
+ // Nilpotent means X op X === 0, so reduce weights modulo 2.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ LHS = 0; // 1 + 1 === 0 modulo 2.
+ return;
+ }
+ if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
+ // TODO: Reduce the weight by exploiting nsw/nuw?
+ LHS += RHS;
+ return;
+ }
+
+ assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+ "Unknown associative operation!");
+ unsigned Bitwidth = LHS.getBitWidth();
+ // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+ // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
+ // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+ // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
+ // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+ // which by a happy accident means that they can always be represented using
+ // Bitwidth bits.
+ // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+ // the Carmichael number).
+ if (Bitwidth > 3) {
+ /// CM - The value of Carmichael's lambda function.
+ APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+ // Any weight W >= Threshold can be replaced with W - CM.
+ APInt Threshold = CM + Bitwidth;
+ assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+ // For Bitwidth 4 or more the following sum does not overflow.
+ LHS += RHS;
+ while (LHS.uge(Threshold))
+ LHS -= CM;
+ } else {
+ // To avoid problems with overflow do everything the same as above but using
+ // a larger type.
+ unsigned CM = 1U << CarmichaelShift(Bitwidth);
+ unsigned Threshold = CM + Bitwidth;
+ assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+ "Weights not reduced!");
+ unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+ while (Total >= Threshold)
+ Total -= CM;
+ LHS = Total;
+ }
+}
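+
+// Worked example (for illustration): when linearizing (X ^ Y) ^ X, the leaf X
+// is reached twice, once through each operand path. Xor is nilpotent, so
+// IncorporateWeight reduces its combined weight 1 + 1 to 0 and X is dropped
+// from the linearized form, leaving just Y, which matches the identity
+// (X ^ Y) ^ X == Y.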
+
+typedef std::pair<Value*, APInt> RepeatedValue;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+/// expression graph node names
+///
+/// + | I
+/// / \ |
+/// + + | A, B
+/// / \ / \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+/// + | I
+/// / \ |
+/// + + | A, B - operand of B replaced with undef
+/// / \ \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+
+static bool LinearizeExprTree(BinaryOperator *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+ unsigned Opcode = I->getOpcode();
+ assert(I->isAssociative() && I->isCommutative() &&
+ "Expected an associative and commutative operation!");
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ bool Changed = false;
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (eg: a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ typedef DenseMap<Value*, APInt> LeafMap;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+ SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // We examine the operands of this binary operator.
+
+ for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If this is a binary operation of the right kind with only one use then
+ // add its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op).second && "Not first visit!");
+ DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op).second && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else {
+ // Already in the leaf map.
+ assert(It != Leaves.end() && Visited.count(Op) &&
+ "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ Changed = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second;
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+ if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+ (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+ DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ BO = LowerNegateToMultiply(BO);
+ DEBUG(dbgs() << *BO << '\n');
+ Worklist.push_back(std::make_pair(BO, Weight));
+ Changed = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ }
+ }
+
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ Ops.push_back(std::make_pair(V, Weight));
+ }
+
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+ assert(Identity && "Associative operation without identity!");
+ Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ }
+
+ return Changed;
+}
+
+/// Now that the operands for this expression tree are
+/// linearized and optimized, emit them in-order.
+void ReassociatePass::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I;
+
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+  /// due to it being removed as an operand of one of its uses. Ensure that misuse
+ /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+ /// leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = nullptr;
+ for (unsigned i = 0; ; ++i) {
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
+ Op = BO;
+ continue;
+ }
+
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid but it might mean that the problem was just too
+  // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType());
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ if (NewOp->getType()->isFPOrFPVectorTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
+
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+ Op = NewOp;
+ }
+
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ // Preserve FastMathFlags.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
+ if (ExpressionChanged == I)
+ break;
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
+ } while (1);
+
+ // Throw away any left over nodes from the original expression.
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Insert instructions before the instruction pointed to by BI that compute
+/// the negated version of the specified value. The negated value is returned,
+/// and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+/// Also add intermediate instructions to the redo list that are modified while
+/// pushing the negates through adds. These will be revisited to see if
+/// additional opportunities have been exposed.
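+///
+/// For example (illustrative IR; value names are arbitrary), negating
+///   %t = add i32 %a, %b
+/// pushes the negation through the add, producing roughly:
+///   %a.neg = sub i32 0, %a
+///   %b.neg = sub i32 0, %b
+///   %t.neg = add i32 %a.neg, %b.neg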
+static Value *NegateValue(Value *V, Instruction *BI,
+ SetVector<AssertingVH<Instruction>> &ToRedo) {
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (C->getType()->isFPOrFPVectorTy()) {
+ return ConstantExpr::getFNeg(C);
+ }
+ return ConstantExpr::getNeg(C);
+ }
+
+  // We are trying to expose opportunities for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that a later expression Y = 12+X can be reassociated with the -12 to
+  // eliminate the constants. We assume that instcombine will clean up the mess
+  // later if we introduce tons of unnecessary negation instructions.
+ //
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+ if (I->getOpcode() == Instruction::Add) {
+ I->setHasNoUnsignedWrap(false);
+ I->setHasNoSignedWrap(false);
+ }
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+
+ // Add the intermediate negates to the redo list as processing them later
+ // could expose more reassociating opportunities.
+ ToRedo.insert(I);
+ return I;
+ }
+
+ // Okay, we need to materialize a negated version of V with an instruction.
+ // Scan the use lists of V to see if we have one already.
+ for (User *U : V->users()) {
+ if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+ continue;
+
+ // We found one! Now we have to make sure that the definition dominates
+ // this use. We do this by moving it to the entry block (if it is a
+ // non-instruction value) or right after the definition. These negates will
+ // be zapped by reassociate later, so we don't need much finesse here.
+ BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+ // Verify that the negate is in this function, V might be a constant expr.
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+ continue;
+
+ BasicBlock::iterator InsertPt;
+ if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+ InsertPt = II->getNormalDest()->begin();
+ } else {
+ InsertPt = ++InstInput->getIterator();
+ }
+ while (isa<PHINode>(InsertPt)) ++InsertPt;
+ } else {
+ InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+ }
+ TheNeg->moveBefore(&*InsertPt);
+ if (TheNeg->getOpcode() == Instruction::Sub) {
+ TheNeg->setHasNoUnsignedWrap(false);
+ TheNeg->setHasNoSignedWrap(false);
+ } else {
+ TheNeg->andIRFlags(BI);
+ }
+ ToRedo.insert(TheNeg);
+ return TheNeg;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ ToRedo.insert(NewNeg);
+ return NewNeg;
+}
+
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+ // If this is a negation, we can't split it up!
+ if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+ return false;
+
+  // Don't break up X - undef.
+ if (isa<UndefValue>(Sub->getOperand(1)))
+ return false;
+
+  // Don't bother to break this up unless either operand is a reassociable add
+  // or subtract, or this instruction's only use is a reassociable add or
+  // subtract.
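+  // For example (illustrative), (A+B) - C is worth breaking up into
+  // (A+B) + (-C), because the resulting add can then be reassociated with the
+  // A+B tree.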
+ Value *V0 = Sub->getOperand(0);
+ if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *V1 = Sub->getOperand(1);
+ if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *VB = Sub->user_back();
+ if (Sub->hasOneUse() &&
+ (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
+ return true;
+
+ return false;
+}
+
+/// If we have (X-Y), and if either X is an add, or if this is only used by an
+/// add, transform this into (X+(0-Y)) to promote better reassociation.
+static BinaryOperator *
+BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+ // Convert a subtract into an add and a neg instruction. This allows sub
+ // instructions to be commuted with other add instructions.
+ //
+  // Calculate the negated value of operand 1 of the sub instruction,
+  // and use it as the RHS of the add instruction we are about to create.
+ //
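+  // For example (illustrative IR):
+  //   %s = sub i32 %x, %y
+  // becomes
+  //   %y.neg = sub i32 0, %y
+  //   %s     = add i32 %x, %y.neg
+  //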
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+ BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
+ Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ New->takeName(Sub);
+
+ // Everyone now refers to the add instruction.
+ Sub->replaceAllUsesWith(New);
+ New->setDebugLoc(Sub->getDebugLoc());
+
+ DEBUG(dbgs() << "Negated: " << *New << '\n');
+ return New;
+}
+
+/// If this is a shift of a reassociable multiply or is used by one, change
+/// this into a multiply by a constant to assist with further reassociation.
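+///
+/// For example (illustrative), (X << 4) becomes (X * 16), which can then
+/// participate in multiply reassociation.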
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+ BinaryOperator *Mul =
+ BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+ Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+ Mul->takeName(Shl);
+
+ // Everyone now refers to the mul instruction.
+ Shl->replaceAllUsesWith(Mul);
+ Mul->setDebugLoc(Shl->getDebugLoc());
+
+ // We can safely preserve the nuw flag in all cases. It's also safe to turn a
+ // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
+ // handling.
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+ bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ Mul->setHasNoSignedWrap(true);
+ Mul->setHasNoUnsignedWrap(NUW);
+ return Mul;
+}
+
+/// Scan backwards and forwards among values with the same rank as element i
+/// to see if X exists. If X does not exist, return i. This is useful when
+/// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
+ unsigned XRank = Ops[i].Rank;
+ unsigned e = Ops.size();
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ // Scan backwards.
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ return i;
+}
+
+/// Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I,
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
+ if (Ops.size() == 1) return Ops.back();
+
+ Value *V1 = Ops.back();
+ Ops.pop_back();
+ Value *V2 = EmitAddTreeOfValues(I, Ops);
+ return CreateAdd(V2, V1, "tmp", I, I);
+}
+
+/// If V is an expression tree that is a multiplication sequence,
+/// and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
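+///
+/// For example (illustrative), removing the factor B from the tree for A*B*C
+/// yields A*C. If Factor is a constant such as 5 and the tree contains -5
+/// instead, the remaining product is returned negated.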
+Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
+ SmallVector<ValueEntry, 8> Factors;
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ bool FoundFactor = false;
+ bool NeedsNegate = false;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+
+ // If this is a negative version of this factor, remove it.
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+ if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+ if (FC1->getValue() == -FC2->getValue()) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ const APFloat &F1 = FC1->getValueAPF();
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign();
+ if (F1.compare(F2) == APFloat::cmpEqual) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
+ }
+
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return nullptr;
+ }
+
+ BasicBlock::iterator InsertPt = ++BO->getIterator();
+
+ // If this was just a single multiply, remove the multiply and return the only
+ // remaining operand.
+ if (Factors.size() == 1) {
+ RedoInsts.insert(BO);
+ V = Factors[0].Op;
+ } else {
+ RewriteExprTree(BO, Factors);
+ V = BO;
+ }
+
+ if (NeedsNegate)
+ V = CreateNeg(V, "neg", &*InsertPt, BO);
+
+ return V;
+}
+
+/// If V is a single-use multiply, recursively add its operands as factors,
+/// otherwise add V to the list of factors.
+static void FindSingleUseMultiplyFactors(Value *V,
+ SmallVectorImpl<Value*> &Factors) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO) {
+ Factors.push_back(V);
+ return;
+ }
+
+ // Otherwise, add the LHS and RHS to the list of factors.
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities. If it can be reduced to a single Value,
+/// it is returned, otherwise the Ops list is mutated as necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
+ Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) // ...&X&~X = 0
+ return Constant::getNullValue(X->getType());
+
+ if (Opcode == Instruction::Or) // ...|X|~X = -1
+ return Constant::getAllOnesValue(X->getType());
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values for And and Or.
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ ++NumAnnihil;
+ continue;
+ }
+
+ // Drop pairs of values for Xor.
+ assert(Opcode == Instruction::Xor);
+ if (e == 2)
+ return Constant::getNullValue(Ops[0].Op->getType());
+
+ // Y ^ X^X -> Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2;
+ ++NumAnnihil;
+ }
+ }
+ return nullptr;
+}
+
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and returns the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// nullptr is returned; 2) if the constant is ~0, the symbolic operand is
+/// returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+ const APInt &ConstOpnd) {
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
+ return Opnd;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+//
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
+ // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+ // = ((x | c1) ^ c1) ^ (c1 ^ c2)
+ // = (x & ~c1) ^ (c1 ^ c2)
+ // It is useful only when c1 == c2.
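+  // For example (illustrative), with c1 == c2 == 0x0F on an i8 value:
+  //   (x | 0x0F) ^ 0x0F  ==  x & 0xF0, and ConstOpnd becomes 0.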
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
+
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
+
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
+}
+
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value.
+//
+// If it was successful, true is returned, and "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression
+// evaluates to a constant, "Res" is set to nullptr); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ XorOpnd *Opnd2, APInt &ConstOpnd,
+ Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+  // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
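+  //
+  // For example (illustrative), with c1 == 0b1100 and c2 == 0b1010:
+  //   (x | 0b1100) ^ (x & 0b1010) == (x & 0b1001) ^ 0b1100,
+  // since c3 = ~0b1100 ^ 0b1010 = 0b1001.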
+ //
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2);
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3);
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *ReassociatePass::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+ return V;
+
+ if (Ops.size() == 1)
+ return nullptr;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<XorOpnd*, 8> OpndPtrs;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, PatternMatch::m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ }
+ }
+
+  // NOTE: From this point on, do *NOT* add or delete elements to/from "Opnds".
+  // Doing so would invalidate the pointers into "Opnds", and hence invalidate
+  // "OpndPtrs" as well. For the same reason, do not fuse this loop with the
+  // previous one --- adding new elements to "Opnds" may reallocate the vector
+  // and leave the pointers stashed in "OpndPtrs" dangling.
+ for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+ OpndPtrs.push_back(&Opnds[i]);
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ //
+  // The purpose is twofold:
+  // 1) Cluster together operands sharing the same symbolic value.
+  // 2) Operands with a smaller symbolic-value rank are permuted earlier, which
+  //    can potentially shorten the critical path and expose more
+  //    loop-invariants. Note that a value's rank is basically defined by RPO
+  //    order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), then X is defined
+  //    earlier than Y, which is defined earlier than Z. Permuting "X | 1",
+  //    "Y & 2" and "Z" into the order X-Y-Z is better than any other order.
+ std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
+ [](XorOpnd *LHS, XorOpnd *RHS) {
+ return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+ });
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = nullptr;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd *CurrOpnd = OpndPtrs[i];
+ // The combined value
+ Value *CV;
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV);
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+    // Step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
+ //
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = nullptr;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ unsigned Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *ReassociatePass::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and -X pairs, along with X and ~X
+  // pairs. If we find any, we can simplify expressions like X+-X == 0 and
+  // X+~X == -1. While we're at it, scan for any duplicates, which we want to
+  // canonicalize, e.g. Y+Y+Y+Z -> 3*Y+Z.
+
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *TheOp = Ops[i].Op;
+ // Check to see if we've seen this operand before. If so, we factor all
+ // instances of the operand together. Due to our sorting criteria, we know
+ // that these need to be next to each other in the vector.
+ if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+ // Rescan the list, remove all instances of this operand from the expr.
+ unsigned NumFound = 0;
+ do {
+ Ops.erase(Ops.begin()+i);
+ ++NumFound;
+ } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+ DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ ++NumFactor;
+
+ // Insert a new multiply.
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+ // Now that we have inserted a multiply, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+ RedoInsts.insert(Mul);
+
+ // If every add operand was a duplicate, return the multiply.
+ if (Ops.empty())
+ return Mul;
+
+ // Otherwise, we had some input that didn't have the dupe, such as
+ // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+ --i;
+ e = Ops.size();
+ continue;
+ }
+
+ // Check for X and -X or X and ~X in the operand list.
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+ !BinaryOperator::isNot(TheOp))
+ continue;
+
+ Value *X = nullptr;
+ if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
+ X = BinaryOperator::getNegArgument(TheOp);
+ else if (BinaryOperator::isNot(TheOp))
+ X = BinaryOperator::getNotArgument(TheOp);
+
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX == i)
+ continue;
+
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2 &&
+ (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+ return Constant::getNullValue(X->getType());
+
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+ return Constant::getAllOnesValue(X->getType());
+
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX;
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+
+    // If the removed pair was X and ~X, append -1 to the operand list.
+ if (BinaryOperator::isNot(TheOp)) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
+ }
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ DenseMap<Value*, unsigned> FactorOccurrences;
+
+ // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+ // where they are actually the same multiply.
+ unsigned MaxOcc = 0;
+ Value *MaxOccVal = nullptr;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ // Compute all of the factors of this added value.
+ SmallVector<Value*, 8> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ SmallPtrSet<Value*, 8> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ Value *Factor = Factors[i];
+ if (!Duplicates.insert(Factor).second)
+ continue;
+
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+
+ // If Factor is a negative constant, add the negated value as a factor
+ // because we can percolate the negate out. Watch for minint, which
+      // cannot be made positive.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+ if (CI->isNegative() && !CI->isMinValue(true)) {
+ Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ ++NumFactor;
+
+    // Create a new instruction that uses MaxOccVal twice. Without this,
+    // removing a factor from an expression could drop a use of MaxOccVal,
+    // which can cause RemoveFactorFromExpression on successive values to
+    // behave differently.
+ Instruction *DummyInst =
+ I->getType()->isIntOrIntVectorTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
+ for (unsigned i = 0; i != Ops.size(); ++i) {
+ // Only try to remove factors from expressions we're allowed to.
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ // The factorized operand may occur several times. Convert them all in
+ // one fell swoop.
+ for (unsigned j = Ops.size(); j != i;) {
+ --j;
+ if (Ops[j].Op == Ops[i].Op) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+j);
+ }
+ }
+ --i;
+ }
+ }
+
+ // No need for extra uses anymore.
+ DummyInst->deleteValue();
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+ // Now that we have inserted the add tree, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+ (void)NumAddedValues;
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
+
+ // Create the multiply.
+ Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
+
+    // Rerun reassociation on the multiply in case the inner expression turned
+    // into a multiply. We want to make sure that we keep things in canonical
+    // form.
+ RedoInsts.insert(V2);
+
+ // If every add operand included the factor (e.g. "A*B + A*C"), then the
+ // entire result expression is just the multiply "A*(B+C)".
+ if (Ops.empty())
+ return V2;
+
+ // Otherwise, we had some input that didn't have the factor, such as
+ // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+ }
+
+ return nullptr;
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+  // a simplification. This is an important invariant to prevent cyclically
+ // trying to simplify already minimal formations.
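+  //
+  // For example (illustrative), x*x*y*y has a factor power sum of 4 and can be
+  // rebuilt as t = x*y; t*t, saving a multiply, whereas x*x*y (power sum 2) is
+  // already minimal.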
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U;
+ Idx -= Count;
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+  // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ std::stable_sort(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ });
+ return true;
+}
+
+/// \brief Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+ SmallVectorImpl<Value*> &Ops) {
+ if (Ops.size() == 1)
+ return Ops.back();
+
+ Value *LHS = Ops.pop_back_val();
+ do {
+ if (LHS->getType()->isIntOrIntVectorTy())
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ else
+ LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
+ } while (!Ops.empty());
+
+ return LHS;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
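+///
+/// For example (illustrative), given the factors [(a, 3), (b, 2)] the DAG
+/// built is roughly t = a*b; result = (t*t)*a, computing a^3 * b^2 with three
+/// multiplies instead of the four a linear chain would use.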
+Value *
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx;
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }),
+ Factors.end());
+
+  // Iteratively collect the base of each factor with an odd power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) {
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot);
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
+
+Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+  // We can only optimize the multiplies when there is a chain of at least four
+  // operands, such that a balanced tree might require fewer total multiplies.
+ if (Ops.size() < 4)
+ return nullptr;
+
+ // Try to turn linear trees of multiplies without other uses of the
+ // intermediate stages into minimal multiply DAGs with perfect sub-expression
+ // re-use.
+ SmallVector<Factor, 4> Factors;
+ if (!collectMultiplyFactors(Ops, Factors))
+ return nullptr; // All distinct factors, so nothing left for us to do.
+
+ IRBuilder<> Builder(I);
+ // The reassociate transformation for FP operations is performed only
+ // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
+ // to the newly generated operations.
+ if (auto FPI = dyn_cast<FPMathOperator>(I))
+ Builder.setFastMathFlags(FPI->getFastMathFlags());
+
+ Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+ if (Ops.empty())
+ return V;
+
+ ValueEntry NewEntry = ValueEntry(getRank(V), V);
+ Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
+ return nullptr;
+}
+
+Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Now that we have the linearized expression tree, try to optimize it.
+ // Start by folding any constants that we found.
+ Constant *Cst = nullptr;
+ unsigned Opcode = I->getOpcode();
+ while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+ Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+ Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+ }
+ // If there was nothing but constants then we are done.
+ if (Ops.empty())
+ return Cst;
+
+ // Put the combined constant back at the end of the operand list, except if
+ // there is no point. For example, an add of 0 gets dropped here, while a
+ // multiplication by zero turns the whole expression into zero.
+ if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+ if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+ return Cst;
+ Ops.push_back(ValueEntry(0, Cst));
+ }
+
+ if (Ops.size() == 1) return Ops[0].Op;
+
+ // Handle destructive annihilation due to identities between elements in the
+ // argument list here.
+ unsigned NumOps = Ops.size();
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ case Instruction::Or:
+ if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+ return Result;
+ break;
+
+ case Instruction::Xor:
+ if (Value *Result = OptimizeXor(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ if (Value *Result = OptimizeAdd(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Mul:
+ case Instruction::FMul:
+ if (Value *Result = OptimizeMul(I, Ops))
+ return Result;
+ break;
+ }
+
+ if (Ops.size() != NumOps)
+ return OptimizeExpression(I, Ops);
+ return nullptr;
+}
+
+// Remove dead instructions, and if any operands become trivially dead as a
+// result, add them to Insts so they will be removed as well.
+void ReassociatePass::RecursivelyEraseDeadInsts(
+ Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+ ValueRankMap.erase(I);
+ Insts.remove(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ for (auto Op : Ops)
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->use_empty())
+ Insts.insert(OpInst);
+}
+
+/// Zap the given instruction, adding interesting operands to the work list.
+void ReassociatePass::EraseInst(Instruction *I) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+
+ SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+ // Erase the dead instruction.
+ ValueRankMap.erase(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ // Optimize its operands.
+ SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+ // If this is a node in an expression tree, climb to the expression root
+ // and add that since that's where optimization actually happens.
+ unsigned Opcode = Op->getOpcode();
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
+ Visited.insert(Op).second)
+ Op = Op->user_back();
+ RedoInsts.insert(Op);
+ }
+
+ MadeChange = true;
+}
+
+// Canonicalize expressions of the following form:
+// x + (-Constant * y) -> x - (Constant * y)
+// x - (-Constant * y) -> x + (Constant * y)
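+//
+// For example (illustrative IR; fast-math flags and exact constant formatting
+// elided):
+//   %m = fmul double %y, -3.0
+//   %r = fadd double %x, %m
+// becomes
+//   %m = fmul double %y, 3.0
+//   %r = fsub double %x, %m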
+Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
+ if (!I->hasOneUse() || I->getType()->isVectorTy())
+ return nullptr;
+
+ // Must be a fmul or fdiv instruction.
+ unsigned Opcode = I->getOpcode();
+ if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
+ return nullptr;
+
+ auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
+ auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
+
+ // Both operands are constant, let it get constant folded away.
+ if (C0 && C1)
+ return nullptr;
+
+ ConstantFP *CF = C0 ? C0 : C1;
+
+ // Must have one constant operand.
+ if (!CF)
+ return nullptr;
+
+ // Must be a negative ConstantFP.
+ if (!CF->isNegative())
+ return nullptr;
+
+ // User must be a binary operator with one or more uses.
+ Instruction *User = I->user_back();
+ if (!isa<BinaryOperator>(User) || User->use_empty())
+ return nullptr;
+
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
+ return nullptr;
+
+ // Subtraction is not commutative. Explicitly, the following transform is
+ // not valid: (-Constant * y) - x -> x + (Constant * y)
+ if (!User->isCommutative() && User->getOperand(1) != I)
+ return nullptr;
+
+ // Change the sign of the constant.
+ APFloat Val = CF->getValueAPF();
+ Val.changeSign();
+ I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
+
+ // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
+ // ((-Const*y) + x) -> (x + (-Const*y)).
+ if (User->getOperand(0) == I && User->isCommutative())
+ cast<BinaryOperator>(User)->swapOperands();
+
+ Value *Op0 = User->getOperand(0);
+ Value *Op1 = User->getOperand(1);
+ BinaryOperator *NI;
+ switch (UserOpcode) {
+ default:
+ llvm_unreachable("Unexpected Opcode!");
+ case Instruction::FAdd:
+ NI = BinaryOperator::CreateFSub(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ case Instruction::FSub:
+ NI = BinaryOperator::CreateFAdd(Op0, Op1);
+ NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+ break;
+ }
+
+ NI->insertBefore(User);
+ NI->setName(User->getName());
+ User->replaceAllUsesWith(NI);
+ NI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ MadeChange = true;
+ return NI;
+}
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void ReassociatePass::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<BinaryOperator>(I))
+ return;
+
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegConstExpr(I))
+ I = Res;
+
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
+
+ // Don't optimize floating point instructions that don't have unsafe algebra.
+ if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+ return;
+
+ // Do not reassociate boolean (i1) expressions. We want to preserve the
+ // original order of evaluation for short-circuited comparisons that
+ // SimplifyCFG has folded to AND/OR expressions. If the expression
+ // is not further optimized, it is likely to be transformed back to a
+ // short-circuited form for code gen, and the source order may have been
+ // optimized for the most likely conditions.
+ if (I->getType()->isIntegerTy(1))
+ return;
+
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ } else if (I->getOpcode() == Instruction::FSub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isFNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ Instruction *NI = LowerNegateToMultiply(I);
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ }
+
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+ // During the initial run we will get to the root of the tree.
+ // But if we get here while we are redoing instructions, there is no
+    // guarantee that the root will be visited, so redo it later.
+ if (BO->user_back() != BO &&
+ BO->getParent() == BO->user_back()->getParent())
+ RedoInsts.insert(BO->user_back());
+ return;
+ }
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+ return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
+
+ ReassociateExpression(BO);
+}
+
+void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
+ // First, walk the expression tree, linearizing the tree, collecting the
+ // operand information.
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(I, Tree);
+ SmallVector<ValueEntry, 8> Ops;
+ Ops.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Ops.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ // Now that we have linearized the tree to a list and have gathered all of
+ // the operands and their ranks, sort the operands by their rank. Use a
+ // stable_sort so that values with equal ranks will have their relative
+ // positions maintained (and so the compiler is deterministic). Note that
+ // this sorts so that the highest ranking values end up at the beginning of
+ // the vector.
+ std::stable_sort(Ops.begin(), Ops.end());
+
+ // Now that we have the expression tree in a convenient
+ // sorted form, optimize it globally if possible.
+ if (Value *V = OptimizeExpression(I, Ops)) {
+ if (V == I)
+ // Self-referential expression in unreachable code.
+ return;
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ I->replaceAllUsesWith(V);
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ VI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ ++NumAnnihil;
+ return;
+ }
+
+ // We want to sink immediates as deeply as possible except in the case where
+ // this is a multiply tree used only by an add, and the immediate is a -1.
+ // In this case we reassociate to put the negation on the outside so that we
+ // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+ if (I->hasOneUse()) {
+ if (I->getOpcode() == Instruction::Mul &&
+ cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ } else if (I->getOpcode() == Instruction::FMul &&
+ cast<Instruction>(I->user_back())->getOpcode() ==
+ Instruction::FAdd &&
+ isa<ConstantFP>(Ops.back().Op) &&
+ cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ }
+ }
+
+ DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ if (Ops.size() == 1) {
+ if (Ops[0].Op == I)
+ // Self-referential expression in unreachable code.
+ return;
+
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ I->replaceAllUsesWith(Ops[0].Op);
+ if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+ OI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ return;
+ }
+
+ // Now that we ordered and optimized the expressions, splat them back into
+ // the expression tree, removing any unneeded nodes.
+ RewriteExprTree(I, Ops);
+}
+
+PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+  // Get the function's basic blocks in reverse post order. This order is used
+  // by BuildRankMap to precalculate ranks correctly. It also excludes dead
+  // basic blocks (the analysis in this pass has been seen to hang when
+  // analyzing dead basic blocks).
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate the rank map for F.
+ BuildRankMap(F, RPOT);
+
+ MadeChange = false;
+  // Traverse the same blocks that were analyzed by BuildRankMap.
+ for (BasicBlock *BI : RPOT) {
+ assert(RankMap.count(&*BI) && "BB should be ranked.");
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
+ if (isInstructionTriviallyDead(&*II)) {
+ EraseInst(&*II++);
+ } else {
+ OptimizeInst(&*II);
+ assert(II->getParent() == &*BI && "Moved to a different block!");
+ ++II;
+ }
+
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+    // dead, mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I)) {
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ MadeChange = true;
+ }
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
+
+ // We are done with the rank map.
+ RankMap.clear();
+ ValueRankMap.clear();
+
+ if (MadeChange) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+ class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+}
+
+char ReassociateLegacyPass::ID = 0;
+INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() {
+ return new ReassociateLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 000000000000..96295683314c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,128 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
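+//
+// For example (illustrative IR; names approximate), a value that is live
+// across basic blocks, such as
+//   %v = add i32 %a, %b                    ; used in another block
+// is demoted to a stack slot:
+//   %v.reg2mem = alloca i32                ; in the entry block
+//   %v = add i32 %a, %b
+//   store i32 %v, i32* %v.reg2mem
+//   %v.reload = load i32, i32* %v.reg2mem  ; rematerialized at each use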
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+ struct RegToMem : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RegToMem() : FunctionPass(ID) {
+ initializeRegToMemPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(BreakCriticalEdgesID);
+ AU.addPreservedID(BreakCriticalEdgesID);
+ }
+
+ bool valueEscapes(const Instruction *Inst) const {
+ const BasicBlock *BB = Inst->getParent();
+ for (const User *U : Inst->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return true;
+ }
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char RegToMem::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+
+bool RegToMem::runOnFunction(Function &F) {
+ if (F.isDeclaration() || skipFunction(F))
+ return false;
+
+ // Insert all new allocas into entry block.
+ BasicBlock *BBEntry = &F.getEntryBlock();
+ assert(pred_empty(BBEntry) &&
+ "Entry block to function must not have predecessors!");
+
+  // Find the first non-alloca instruction and create an insertion point. This
+  // is safe if the block is well-formed: it always has a terminator; otherwise
+  // we'll hit an assertion.
+ BasicBlock::iterator I = BBEntry->begin();
+ while (isa<AllocaInst>(I)) ++I;
+
+ CastInst *AllocaInsertionPoint = new BitCastInst(
+ Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+ Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+
+  // Find the escaping instructions, but don't create stack slots for
+  // allocas in the entry block.
+ std::list<Instruction*> WorkList;
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib) {
+ if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+ valueEscapes(&*iib)) {
+ WorkList.push_front(&*iib);
+ }
+ }
+
+ // Demote escaped instructions
+ NumRegsDemoted += WorkList.size();
+ for (Instruction *ilb : WorkList)
+ DemoteRegToStack(*ilb, false, AllocaInsertionPoint);
+
+ WorkList.clear();
+
+ // Find all phi's
+ for (BasicBlock &ibb : F)
+ for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie;
+ ++iib)
+ if (isa<PHINode>(iib))
+ WorkList.push_front(&*iib);
+
+ // Demote phi nodes
+ NumPhisDemoted += WorkList.size();
+ for (Instruction *ilb : WorkList)
+ DemotePHIToStack(cast<PHINode>(ilb), AllocaInsertionPoint);
+
+ return true;
+}
+
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+ return new RegToMem();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 000000000000..f19d45329d23
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,2725 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
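+//
+// Illustrative sketch (hypothetical IR, heavily simplified): a call such as
+//   %obj2 = call i8 addrspace(1)* @foo(i8 addrspace(1)* %obj)
+// becomes a gc.statepoint returning a token, with the original result read
+// back through gc.result and each live GC pointer reintroduced through a
+// gc.relocate tied to that token:
+//   %tok = call token @llvm.experimental.gc.statepoint...( ..., @foo, ... )
+//   %obj.relocated = call coldcc i8 addrspace(1)*
+//       @llvm.experimental.gc.relocate...(token %tok, i32 <base>, i32 <derived>)
+//   %obj2 = call i8 addrspace(1)* @llvm.experimental.gc.result...(token %tok)
+// making any relocation performed by the collector explicit in the IR.
+//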
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+ cl::init(false));
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+ cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize value instead
+// of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+ cl::init(6));
+
+#ifdef EXPENSIVE_CHECKS
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden);
+
+static cl::opt<bool>
+ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+ cl::Hidden, cl::init(true));
+
+namespace {
+struct RewriteStatepointsForGC : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGC() : ModulePass(ID) {
+ initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F);
+ bool runOnModule(Module &M) override {
+ bool Changed = false;
+ for (Function &F : M)
+ Changed |= runOnFunction(F);
+
+ if (Changed) {
+ // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidAttributesAndMetadata(M);
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ /// The IR fed into RewriteStatepointsForGC may have had attributes and
+ /// metadata implying dereferenceability that are no longer valid/correct after
+ /// RewriteStatepointsForGC has run. This is because semantically, after
+ /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+ /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// correctness by erasing all attributes in the module that externally imply
+ /// dereferenceability. Similar reasoning also applies to the noalias
+ /// attributes and metadata. gc.statepoint can touch the entire heap including
+ /// noalias objects.
+ void stripNonValidAttributesAndMetadata(Module &M);
+
+ // Helpers for stripNonValidAttributesAndMetadata
+ void stripNonValidAttributesAndMetadataFromBody(Function &F);
+ void stripNonValidAttributesFromPrototype(Function &F);
+ // Certain metadata on instructions are invalid after running RS4GC.
+ // Optimizations that run after RS4GC can incorrectly use this metadata to
+ // optimize functions. We drop such metadata on the instruction.
+ void stripInvalidMetadataFromInstruction(Instruction &I);
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCPass() {
+ return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ MapVector<BasicBlock *, SetVector<Value *>> KillSet;
+  /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
+};
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type.
+typedef MapVector<Value *, Value *> DefiningValueMapTy;
+typedef SetVector<Value *> StatepointLiveSetTy;
+typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>>
+ RematerializedValueMapTy;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy LiveSet;
+
+ /// Mapping from live pointers to a base-defining-value
+ MapVector<Value *, Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ Instruction *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+  /// Record live values that we rematerialize instead of relocating.
+  /// They are not included in the 'LiveSet' field.
+  /// Maps each rematerialized copy to its original value.
+ RematerializedValueMapTy RematerializedValues;
+};
+}
+
+static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
+ Optional<OperandBundleUse> DeoptBundle =
+ CS.getOperandBundle(LLVMContext::OB_deopt);
+
+ if (!DeoptBundle.hasValue()) {
+ assert(AllowStatepointWithNoDeoptInfo &&
+ "Found non-leaf call without deopt info!");
+ return None;
+ }
+
+ return DeoptBundle.getValue().Inputs;
+}
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+static bool isGCPointerType(Type *T) {
+ if (auto *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return PT->getAddressSpace() == 1;
+ return false;
+}
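+
+// For example (illustrative), under this convention an "i8 addrspace(1)*"
+// value is treated as a pointer into the GC-managed heap and must be
+// relocatable, while a plain "i8*" (address space 0) is ignored.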
+
+// Return true if this type is one which a) is a gc pointer or contains a GC
+// pointer and b) is of a type this code expects to encounter as a live value.
+// (The insertion code will assert that a type which matches (a) and not (b)
+// is not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+ // We fully support gc pointers
+ if (isGCPointerType(T))
+ return true;
+ // We partially support vectors of gc pointers. The code will assert if it
+ // can't handle something.
+ if (auto VT = dyn_cast<VectorType>(T))
+ if (isGCPointerType(VT->getElementType()))
+ return true;
+ return false;
+}
+
+#ifndef NDEBUG
+/// Returns true if this type contains a gc pointer whether we know how to
+/// handle that type or not.
+static bool containsGCPtrType(Type *Ty) {
+ if (isGCPointerType(Ty))
+ return true;
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return containsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return any_of(ST->subtypes(), containsGCPtrType);
+ return false;
+}
+
+// Returns true if this is a type which a) is a gc pointer or contains a GC
+// pointer and b) is of a type which the code doesn't expect (i.e. first class
+// aggregates). Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+ return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
+}
+#endif
+
+// Return the name of the value suffixed with the provided suffix, or if the
+// value didn't have a name, the default name specified.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+ StringRef DefaultName) {
+ return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
+}
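+
+// For example (illustrative), suffixed_name_or(V, ".relocated", "") produces
+// "obj.relocated" when V is named "obj", and the empty string when V has no
+// name.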
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+static void
+analyzeParsePointLiveness(DominatorTree &DT,
+ GCPtrLivenessData &OriginalLivenessData, CallSite CS,
+ PartiallyConstructedSafepointRecord &Result) {
+ Instruction *Inst = CS.getInstruction();
+
+ StatepointLiveSetTy LiveSet;
+ findLiveSetAtInst(Inst, OriginalLivenessData, LiveSet);
+
+ if (PrintLiveSet) {
+ dbgs() << "Live Variables:\n";
+ for (Value *V : LiveSet)
+ dbgs() << " " << V->getName() << " " << *V << "\n";
+ }
+ if (PrintLiveSetSize) {
+ dbgs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+ dbgs() << "Number live values: " << LiveSet.size() << "\n";
+ }
+ Result.LiveSet = LiveSet;
+}
+
+static bool isKnownBaseResult(Value *V);
+namespace {
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value. The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value. The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer.
+struct BaseDefiningValueResult {
+ /// Contains the value which is the base defining value.
+ Value * const BDV;
+ /// True if the base defining value is also known to be an actual base
+ /// pointer.
+ const bool IsKnownBase;
+ BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+ : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+ // Check consistency between new and old means of checking whether a BDV is
+ // a base.
+ bool MustBeBase = isKnownBaseResult(BDV);
+ assert(!MustBeBase || MustBeBase == IsKnownBase);
+#endif
+ }
+};
+}
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the 'Index' element of the given vector
+/// instruction 'I'. If Index is null, returns a BDV for the entire vector
+/// 'I'. As an optimization, this method will try to determine when the
+/// element is known to already be a base pointer. If this can be established,
+/// the IsKnownBase field of the result will be true. Note that either a
+/// vector or a pointer typed value can be returned. For the former, the
+/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
+/// If the latter, the returned pointer is a BDV (or possibly a base) for the
+/// particular element in 'I'.
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+ // Each case parallels findBaseDefiningValue below, see that code for
+ // detailed motivation.
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I))
+ // Base of constant vector consists only of constant null pointers.
+ // For reasoning see similar case inside 'findBaseDefiningValue' function.
+ return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
+ true);
+
+ if (isa<LoadInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<InsertElementInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ return BaseDefiningValueResult(I, false);
+
+ if (isa<ShuffleVectorInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+    // TODO: There are a number of local optimizations which could be applied
+    // here for particular shufflevector patterns.
+ return BaseDefiningValueResult(I, false);
+
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ // A PHI or Select is a base defining value. The outer findBasePointer
+ // algorithm is responsible for constructing a base value for this BDV.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "unknown vector instruction - no base found for vector element");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPtrOrPtrVectorTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ if (I->getType()->isVectorTy())
+ return findBaseDefiningValueOfVector(I);
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+    // We should never have reached here if this argument isn't a gc value
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I)) {
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. Besides global references, all kinds of constants (e.g. undef,
+ // constant expressions, null pointers) can be introduced by the inliner or
+ // the optimizer, especially on dynamically dead paths.
+    // Here we treat all of them as having a single null base. By doing this
+    // we try to avoid problems reporting various conflicts in the form of
+ // "phi (const1, const2)" or "phi (const, regular gc ptr)".
+ // See constant.ll file for relevant test cases.
+
+ return BaseDefiningValueResult(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If stripping pointer casts changes the address space there is an
+ // addrspacecast in between.
+ assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+ cast<PointerType>(CI->getType())->getAddressSpace() &&
+ "unsupported addrspacecast");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+    // The value loaded is a gc base itself
+ return BaseDefiningValueResult(I, true);
+
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP is the base
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ llvm_unreachable("statepoints don't produce pointers");
+ case Intrinsic::experimental_gc_relocate: {
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ }
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // TODO: I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+    // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return BaseDefiningValueResult(I, true);
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this is a defining definition of the base just like a load is.
+ if (isa<ExtractValueInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+  // An extractelement produces a base result exactly when its input does.
+ // We may need to insert a parallel instruction to extract the appropriate
+ // element out of the base vector corresponding to the input. Given this,
+ // it's analogous to the phi and select case even though it's not a merge.
+ if (isa<ExtractElementInst>(I))
+    // Note: There are a lot of obvious peephole cases here. These are
+    // deliberately handled after the main base pointer inference algorithm to
+    // make writing test cases to exercise that code easier.
+ return BaseDefiningValueResult(I, false);
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several base
+  // derived pointers (each potentially with its own base). It's the job of
+ // the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+         "missing instruction case in findBaseDefiningValue");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I).BDV;
+ DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
+ }
+ assert(Cache[I] != nullptr);
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseDefiningValueCached(I, Cache);
+ auto Found = Cache.find(Def);
+ if (Found != Cache.end()) {
+ // Either a base-of relation, or a self reference. Caller must check.
+ return Found->second;
+ }
+ // Only a BDV available
+ return Def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer, or do we need to continue searching?
+static bool isKnownBaseResult(Value *V) {
+ if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
+ !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+ !isa<ShuffleVectorInst>(V)) {
+ // no recursion possible
+ return true;
+ }
+ if (isa<Instruction>(V) &&
+ cast<Instruction>(V)->getMetadata("is_base_value")) {
+ // This is a previously inserted base phi or select. We know
+ // that this is a base value.
+ return true;
+ }
+
+ // We need to keep searching
+ return false;
+}
+
+namespace {
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
+public:
+ enum Status { Unknown, Base, Conflict };
+
+ BDVState() : Status(Unknown), BaseValue(nullptr) {}
+
+ explicit BDVState(Status Status, Value *BaseValue = nullptr)
+ : Status(Status), BaseValue(BaseValue) {
+ assert(Status != Base || BaseValue);
+ }
+
+ explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
+
+ Status getStatus() const { return Status; }
+ Value *getBaseValue() const { return BaseValue; }
+
+ bool isBase() const { return getStatus() == Base; }
+ bool isUnknown() const { return getStatus() == Unknown; }
+ bool isConflict() const { return getStatus() == Conflict; }
+
+ bool operator==(const BDVState &Other) const {
+ return BaseValue == Other.BaseValue && Status == Other.Status;
+ }
+
+ bool operator!=(const BDVState &other) const { return !(*this == other); }
+
+ LLVM_DUMP_METHOD
+ void dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+ }
+
+ void print(raw_ostream &OS) const {
+ switch (getStatus()) {
+ case Unknown:
+ OS << "U";
+ break;
+ case Base:
+ OS << "B";
+ break;
+ case Conflict:
+ OS << "C";
+ break;
+ };
+ OS << " (" << getBaseValue() << " - "
+ << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
+ }
+
+private:
+ Status Status;
+ AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
+};
+}
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+ State.print(OS);
+ return OS;
+}
+#endif
+
+static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
+ switch (LHS.getStatus()) {
+ case BDVState::Unknown:
+ return RHS;
+
+ case BDVState::Base:
+ assert(LHS.getBaseValue() && "can't be null");
+ if (RHS.isUnknown())
+ return LHS;
+
+ if (RHS.isBase()) {
+ if (LHS.getBaseValue() == RHS.getBaseValue()) {
+ assert(LHS == RHS && "equality broken!");
+ return LHS;
+ }
+ return BDVState(BDVState::Conflict);
+ }
+ assert(RHS.isConflict() && "only three states!");
+ return BDVState(BDVState::Conflict);
+
+ case BDVState::Conflict:
+ return LHS;
+ }
+ llvm_unreachable("only three states!");
+}
+
+// Values of type BDVState form a lattice, and this function implements the meet
+// operation.
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
+ BDVState Result = meetBDVStateImpl(LHS, RHS);
+ assert(Result == meetBDVStateImpl(RHS, LHS) &&
+ "Math is wrong: meet does not commute!");
+ return Result;
+}
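+
+// Illustrative meet table, derived from meetBDVStateImpl above (B(x) denotes
+// the Base state with base value x, S any state):
+//   meet(Unknown,  S)    == S
+//   meet(B(x),     B(x)) == B(x)
+//   meet(B(x),     B(y)) == Conflict   when x != y
+//   meet(Conflict, S)    == Conflict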
+
+/// For a given value or instruction, figure out what base ptr it's derived from.
+/// For gc objects, this is simply itself. On success, returns a value which is
+/// the base pointer. (This is reliable and can be used for relocation.) On
+/// failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseOrBDV(I, Cache);
+
+ if (isKnownBaseResult(Def))
+ return Def;
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+  //   argument.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analogous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
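+  //
+  // Illustrative sketch (hypothetical IR, names invented): for a derived
+  // pointer merged from two branches,
+  //   %d = phi i8 addrspace(1)* [ %gep.a, %left ], [ %gep.b, %right ]
+  // where %gep.a and %gep.b derive from bases %base.a and %base.b, %d ends
+  // up in the Conflict state and a parallel base phi
+  //   %d.base = phi i8 addrspace(1)* [ %base.a, %left ], [ %base.b, %right ]
+  // is inserted and tagged with !is_base_value metadata, so %d can later be
+  // relocated against %d.base.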
+
+#ifndef NDEBUG
+ auto isExpectedBDVType = [](Value *BDV) {
+ return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
+ isa<ShuffleVectorInst>(BDV);
+ };
+#endif
+
+ // Once populated, will contain a mapping from each potentially non-base BDV
+ // to a lattice value (described above) which corresponds to that BDV.
+ // We use the order of insertion (DFS over the def/use graph) to provide a
+ // stable deterministic ordering for visiting DenseMaps (which are unordered)
+ // below. This is important for deterministic compilation.
+ MapVector<Value *, BDVState> States;
+
+ // Recursively fill in all base defining values reachable from the initial
+ // one for which we don't already know a definite base value for
+ /* scope */ {
+ SmallVector<Value*, 16> Worklist;
+ Worklist.push_back(Def);
+ States.insert({Def, BDVState()});
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+ assert(!isKnownBaseResult(Current) && "why did it get added?");
+
+ auto visitIncomingValue = [&](Value *InVal) {
+ Value *Base = findBaseOrBDV(InVal, Cache);
+ if (isKnownBaseResult(Base))
+ // Known bases won't need new instructions introduced and can be
+ // ignored safely
+ return;
+ assert(isExpectedBDVType(Base) && "the only non-base values "
+ "we see should be base defining values");
+ if (States.insert(std::make_pair(Base, BDVState())).second)
+ Worklist.push_back(Base);
+ };
+ if (PHINode *PN = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : PN->incoming_values())
+ visitIncomingValue(InVal);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(SI->getTrueValue());
+ visitIncomingValue(SI->getFalseValue());
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+ visitIncomingValue(EE->getVectorOperand());
+ } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+ visitIncomingValue(IE->getOperand(0)); // vector operand
+ visitIncomingValue(IE->getOperand(1)); // scalar operand
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
+ visitIncomingValue(SV->getOperand(0));
+ visitIncomingValue(SV->getOperand(1));
+ }
+ else {
+ llvm_unreachable("Unimplemented instruction case");
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "States after initialization:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Return a phi state for a base defining value. We'll generate a new
+ // base state for known bases and expect to find a cached state otherwise.
+ auto getStateForBDV = [&](Value *baseValue) {
+ if (isKnownBaseResult(baseValue))
+ return BDVState(baseValue);
+ auto I = States.find(baseValue);
+ assert(I != States.end() && "lookup failed!");
+ return I->second;
+ };
+
+ bool Progress = true;
+ while (Progress) {
+#ifndef NDEBUG
+ const size_t OldSize = States.size();
+#endif
+ Progress = false;
+ // We're only changing values in this loop, thus safe to keep iterators.
+ // Since this is computing a fixed point, the order of visit does not
+ // effect the result. TODO: We could use a worklist here and make this run
+    // affect the result. TODO: We could use a worklist here and make this run
+ for (auto Pair : States) {
+ Value *BDV = Pair.first;
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+ // Given an input value for the current instruction, return a BDVState
+ // instance which represents the BDV of that value.
+ auto getStateForInput = [&](Value *V) mutable {
+ Value *BDV = findBaseOrBDV(V, Cache);
+ return getStateForBDV(BDV);
+ };
+
+ BDVState NewState;
+ if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
+ NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
+ } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : PN->incoming_values())
+ NewState = meetBDVState(NewState, getStateForInput(Val));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+ // The 'meet' for an extractelement is slightly trivial, but it's still
+ // useful in that it drives us to conflict if our input is.
+ NewState =
+ meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
+ } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)){
+        // Given there's an inherent type mismatch between the operands, this
+        // will *always* produce Conflict.
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
+ } else {
+ // The only instance this does not return a Conflict is when both the
+ // vector operands are the same vector.
+ auto *SV = cast<ShuffleVectorInst>(BDV);
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
+ }
+
+ BDVState OldState = States[BDV];
+ if (OldState != NewState) {
+ Progress = true;
+ States[BDV] = NewState;
+ }
+ }
+
+ assert(OldSize == States.size() &&
+ "fixed point shouldn't be adding any new nodes to state");
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "States after meet iteration:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ assert(!isKnownBaseResult(I) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ // extractelement instructions are a bit special in that we may need to
+ // insert an extract even when we know an exact base for the instruction.
+ // The problem is that we need to convert from a vector base to a scalar
+    // base for the particular index we're interested in.
+ if (State.isBase() && isa<ExtractElementInst>(I) &&
+ isa<VectorType>(State.getBaseValue()->getType())) {
+ auto *EE = cast<ExtractElementInst>(I);
+ // TODO: In many cases, the new instruction is just EE itself. We should
+ // exploit this, but can't do it here since it would break the invariant
+ // about the BDV not being known to be a base.
+ auto *BaseInst = ExtractElementInst::Create(
+ State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Base, BaseInst);
+ }
+
+ // Since we're joining a vector and scalar base, they can never be the
+ // same. As a result, we should always see insert element having reached
+ // the conflict state.
+ assert(!isa<InsertElementInst>(I) || State.isConflict());
+
+ if (!State.isConflict())
+ continue;
+
+ /// Create and insert a new instruction which will represent the base of
+ /// the given instruction 'I'.
+ auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+ if (isa<PHINode>(I)) {
+ BasicBlock *BB = I->getParent();
+ int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ assert(NumPreds > 0 && "how did we reach here");
+ std::string Name = suffixed_name_or(I, ".base", "base_phi");
+ return PHINode::Create(I->getType(), NumPreds, Name, I);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // The undef will be replaced later
+ UndefValue *Undef = UndefValue::get(SI->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_select");
+ return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ee");
+ return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+ EE);
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+ UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ie");
+ return InsertElementInst::Create(VecUndef, ScalarUndef,
+ IE->getOperand(2), Name, IE);
+ } else {
+ auto *SV = cast<ShuffleVectorInst>(I);
+ UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_sv");
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getOperand(2),
+ Name, SV);
+ }
+ };
+ Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+ // Add metadata marking this as a base value
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Conflict, BaseInst);
+ }
+
+  // Returns an instruction which produces the base pointer for a given
+  // instruction. The instruction is assumed to be an input to one of the BDVs
+  // seen in the inference algorithm above. As such, we must either already
+  // know its base defining value is a base, or have inserted a new
+  // instruction to propagate the base of its BDV and have entered that newly
+  // introduced instruction into the state table. In either case, we are
+  // assured to be able to determine an instruction which produces its base
+  // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+ Value *BDV = findBaseOrBDV(Input, Cache);
+ Value *Base = nullptr;
+ if (isKnownBaseResult(BDV)) {
+ Base = BDV;
+ } else {
+ // Either conflict or base.
+ assert(States.count(BDV));
+ Base = States[BDV].getBaseValue();
+ }
+ assert(Base && "Can't be null");
+ // The cast is needed since base traversal may strip away bitcasts
+ if (Base->getType() != Input->getType() && InsertPt)
+ Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
+ return Base;
+ };
+
+ // Fixup all the inputs of the new PHIs. Visit order needs to be
+ // deterministic and predictable because we're naming newly created
+ // instructions.
+ for (auto Pair : States) {
+ Instruction *BDV = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isConflict())
+ continue;
+
+ if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
+ PHINode *PN = cast<PHINode>(BDV);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = PN->getIncomingValue(i);
+ BasicBlock *InBB = PN->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
+
+ int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
+ if (BlockIndex != -1) {
+ Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
+ BasePHI->addIncoming(OldBase, InBB);
+
+#ifndef NDEBUG
+ Value *Base = getBaseForInput(InVal, nullptr);
+ // In essence this assert states: the only way two values
+ // incoming from the same basic block may be different is by
+ // being different bitcasts of the same value. A cleanup
+ // that remains TODO is changing findBaseOrBDV to return an
+ // llvm::Value of the correct type (and still remain pure).
+ // This will remove the need to add bitcasts.
+ assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
+ "Sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
+ }
+
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast in the incoming block.
+ // TODO: Need to split critical edges if insertion is needed
+ Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+ BasePHI->addIncoming(Base, InBB);
+ }
+ assert(BasePHI->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSI =
+ dyn_cast<SelectInst>(State.getBaseValue())) {
+ SelectInst *SI = cast<SelectInst>(BDV);
+
+ // Find the instruction which produces the base for each input.
+ // We may need to insert a bitcast.
+ BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
+ BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
+ } else if (auto *BaseEE =
+ dyn_cast<ExtractElementInst>(State.getBaseValue())) {
+ Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
+ } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())){
+ auto *BdvIE = cast<InsertElementInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvIE->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseIE);
+ BaseIE->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // scalar operand
+ } else {
+ auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
+ auto *BdvSV = cast<ShuffleVectorInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvSV->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseSV);
+ BaseSV->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // vector operand
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ Value *Base = Pair.second.getBaseValue();
+ assert(BDV && Base);
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+ DEBUG(dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
+
+ if (Cache.count(BDV)) {
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+      // Once we transition from the BDV relation being stored in the Cache to
+      // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
+ "base relation should be stable");
+ }
+ Cache[BDV] = Base;
+ }
+ assert(Cache.count(Def));
+ return Cache[Def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+ MapVector<Value *, Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache) {
+ for (Value *ptr : live) {
+ Value *base = findBasePointer(ptr, DVCache);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+ }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &result) {
+ MapVector<Value *, Value *> PointerToBase;
+ findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+ if (PrintBasePointers) {
+ errs() << "Base Pairs (w/o Relocation):\n";
+ for (auto &Pair : PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+      errs() << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+ Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ // TODO-PERF: reuse the original liveness, then simply run the dataflow
+ // again. The old values are still live and will help it stabilize quickly.
+ GCPtrLivenessData RevisedLivenessData;
+ computeLiveInValues(DT, F, RevisedLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
+ }
+}
+
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call. One case which can arise is a phi node
+// at the start of one of the successor blocks. We also need to insert the
+// gc.relocates only on the path which goes through the statepoint. We might
+// need to split an edge to make this possible.
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+ DominatorTree &DT) {
+ BasicBlock *Ret = BB;
+ if (!BB->getUniquePredecessor())
+ Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+ // Now that 'Ret' has unique predecessor we can safely remove all phi nodes
+ // from it
+ FoldSingleEntryPHINodes(Ret);
+ assert(!isa<PHINode>(Ret->begin()) &&
+ "All PHI nodes should have been removed!");
+
+ // At this point, we can safely insert a gc.relocate or gc.result as the first
+ // instruction in Ret if needed.
+ return Ret;
+}
+
+// Create new attribute set containing only attributes which can be transferred
+// from original call to the safepoint.
+static AttributeList legalizeCallAttributes(AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ LLVMContext &Ctx = AL.getContext();
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// liveStart - index of the first live variable.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+///   Builder - LLVM IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+ const int LiveStart,
+ ArrayRef<Value *> BasePtrs,
+ Instruction *StatepointToken,
+ IRBuilder<> Builder) {
+ if (LiveVariables.empty())
+ return;
+
+ auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+ auto ValIt = find(LiveVec, Val);
+ assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
+ size_t Index = std::distance(LiveVec.begin(), ValIt);
+ assert(Index < LiveVec.size() && "Bug in std::find?");
+ return Index;
+ };
+ Module *M = StatepointToken->getModule();
+
+ // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose
+ // element type is i8 addrspace(1)*). We originally generated unique
+ // declarations for each pointer type, but this proved problematic because
+ // the intrinsic mangling code is incomplete and fragile. Since we're moving
+ // towards a single unified pointer type anyways, we can just cast everything
+ // to an i8* of the right address space. A bitcast is added later to convert
+ // gc_relocate to the actual value's type.
+ auto getGCRelocateDecl = [&] (Type *Ty) {
+ assert(isHandledGCPointerType(Ty));
+ auto AS = Ty->getScalarType()->getPointerAddressSpace();
+ Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+ if (auto *VT = dyn_cast<VectorType>(Ty))
+ NewTy = VectorType::get(NewTy, VT->getNumElements());
+ return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+ {NewTy});
+ };
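+
+  // For example (illustrative), a live value of type %T addrspace(1)* is
+  // relocated through the i8 form of the intrinsic,
+  //   %v.relocated = call coldcc i8 addrspace(1)*
+  //       @llvm.experimental.gc.relocate.p1i8(token %tok, i32 <base>, i32 <live>)
+  // and a bitcast back to %T addrspace(1)* is added where needed.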
+
+ // Lazily populated map from input types to the canonicalized form mentioned
+ // in the comment above. This should probably be cached somewhere more
+ // broadly.
+ DenseMap<Type*, Value*> TypeToDeclMap;
+
+ for (unsigned i = 0; i < LiveVariables.size(); i++) {
+ // Generate the gc.relocate call and save the result
+ Value *BaseIdx =
+ Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = Builder.getInt32(LiveStart + i);
+
+ Type *Ty = LiveVariables[i]->getType();
+ if (!TypeToDeclMap.count(Ty))
+ TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
+ Value *GCRelocateDecl = TypeToDeclMap[Ty];
+
+ // only specify a debug name if we can give a useful one
+ CallInst *Reloc = Builder.CreateCall(
+ GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+ suffixed_name_or(LiveVariables[i], ".relocated", ""));
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ Reloc->setCallingConv(CallingConv::Cold);
+ }
+}
+
+namespace {
+
+/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
+class DeferredReplacement {
+ AssertingVH<Instruction> Old;
+ AssertingVH<Instruction> New;
+ bool IsDeoptimize = false;
+
+ DeferredReplacement() {}
+
+public:
+ static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
+ assert(Old != New && Old && New &&
+ "Cannot RAUW equal values or to / from null!");
+
+ DeferredReplacement D;
+ D.Old = Old;
+ D.New = New;
+ return D;
+ }
+
+ static DeferredReplacement createDelete(Instruction *ToErase) {
+ DeferredReplacement D;
+ D.Old = ToErase;
+ return D;
+ }
+
+ static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
+#ifndef NDEBUG
+ auto *F = cast<CallInst>(Old)->getCalledFunction();
+ assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
+ "Only way to construct a deoptimize deferred replacement");
+#endif
+ DeferredReplacement D;
+ D.Old = Old;
+ D.IsDeoptimize = true;
+ return D;
+ }
+
+ /// Does the task represented by this instance.
+ void doReplacement() {
+ Instruction *OldI = Old;
+ Instruction *NewI = New;
+
+ assert(OldI != NewI && "Disallowed at construction?!");
+ assert((!IsDeoptimize || !New) &&
+           "Deoptimize intrinsics are not replaced!");
+
+ Old = nullptr;
+ New = nullptr;
+
+ if (NewI)
+ OldI->replaceAllUsesWith(NewI);
+
+ if (IsDeoptimize) {
+ // Note: we've inserted instructions, so the call to llvm.deoptimize may
+      // not necessarily be followed by the matching return.
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
+ new UnreachableInst(RI->getContext(), RI);
+ RI->eraseFromParent();
+ }
+
+ OldI->eraseFromParent();
+ }
+};
+}
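+
+// For example (illustrative), rather than calling replaceAllUsesWith while
+// other safepoint records may still reference the old call, a caller can queue
+//   Replacements.push_back(DeferredReplacement::createRAUW(OldCall, GCResult));
+// and run doReplacement() on each entry once all statepoints are in place.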
+
+static StringRef getDeoptLowering(CallSite CS) {
+ const char *DeoptLowering = "deopt-lowering";
+ if (CS.hasFnAttr(DeoptLowering)) {
+ // FIXME: CallSite has a *really* confusing interface around attributes
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
+ Function *F = CS.getCalledFunction();
+ assert(F && F->hasFnAttribute(DeoptLowering));
+ return F->getFnAttribute(DeoptLowering).getValueAsString();
+ }
+ return "live-through";
+}
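+
+// For example (illustrative), a call site carrying the function attribute
+//   "deopt-lowering"="live-in"
+// requests that its deopt operands be lowered as live-in values; when the
+// attribute is absent, the "live-through" default above applies.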
+
+
+static void
+makeStatepointExplicitImpl(const CallSite CS, /* to replace */
+ const SmallVectorImpl<Value *> &BasePtrs,
+ const SmallVectorImpl<Value *> &LiveVariables,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ assert(BasePtrs.size() == LiveVariables.size());
+
+  // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ Instruction *InsertBefore = CS.getInstruction();
+ IRBuilder<> Builder(InsertBefore);
+
+ ArrayRef<Value *> GCArgs(LiveVariables);
+ uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
+ uint32_t NumPatchBytes = 0;
+ uint32_t Flags = uint32_t(StatepointFlags::None);
+
+ ArrayRef<Use> CallArgs(CS.arg_begin(), CS.arg_end());
+ ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(CS);
+ ArrayRef<Use> TransitionArgs;
+ if (auto TransitionBundle =
+ CS.getOperandBundle(LLVMContext::OB_gc_transition)) {
+ Flags |= uint32_t(StatepointFlags::GCTransition);
+ TransitionArgs = TransitionBundle->Inputs;
+ }
+
+ // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
+  // with a return value, we lower them as never-returning calls to
+ // __llvm_deoptimize that are followed by unreachable to get better codegen.
+ bool IsDeoptimize = false;
+
+ StatepointDirectives SD =
+ parseStatepointDirectivesFromAttrs(CS.getAttributes());
+ if (SD.NumPatchBytes)
+ NumPatchBytes = *SD.NumPatchBytes;
+ if (SD.StatepointID)
+ StatepointID = *SD.StatepointID;
+
+ // Pass through the requested lowering if any. The default is live-through.
+ StringRef DeoptLowering = getDeoptLowering(CS);
+ if (DeoptLowering.equals("live-in"))
+ Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+ else {
+ assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ }
+
+ Value *CallTarget = CS.getCalledValue();
+ if (Function *F = dyn_cast<Function>(CallTarget)) {
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
+ // Calls to llvm.experimental.deoptimize are lowered to calls to the
+ // __llvm_deoptimize symbol. We want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ // Note: CallTarget can be a bitcast instruction of a symbol if there are
+ // calls to @llvm.experimental.deoptimize with different argument types in
+ // the same module. This is fine -- we assume the frontend knew what it
+ // was doing when generating this kind of IR.
+ CallTarget =
+ F->getParent()->getOrInsertFunction("__llvm_deoptimize", FTy);
+
+ IsDeoptimize = true;
+ }
+ }
+
+ // Create the statepoint given all the arguments
+ Instruction *Token = nullptr;
+ if (CS.isCall()) {
+ CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *Call = Builder.CreateGCStatepointCall(
+ StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+ TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+ Call->setTailCallKind(ToReplace->getTailCallKind());
+ Call->setCallingConv(ToReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes. If we can handle this set of attributes, set up
+    // function attrs directly on the statepoint and return attrs later for
+    // the gc_result intrinsic.
+ Call->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
+
+ Token = Call;
+
+ // Put the following gc_result and gc_relocate calls immediately after
+ // the old call (which we're about to delete).
+ assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+ Builder.SetInsertPoint(ToReplace->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+ } else {
+ InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+ StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+ ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+ GCArgs, "statepoint_token");
+
+ Invoke->setCallingConv(ToReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. If we can handle this set of attributes,
+ // set up the function attrs directly on the statepoint and return attrs
+ // later for the gc_result intrinsic.
+ Invoke->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
+
+ Token = Invoke;
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+ assert(!isa<PHINode>(UnwindBlock->begin()) &&
+ UnwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+ // Attach exceptional gc relocates to the landingpad.
+ Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+ Result.UnwindToken = ExceptionalToken;
+
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
+ Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *NormalDest = ToReplace->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()) &&
+ NormalDest->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+ // gc relocates will be generated later as if this were a regular call
+ // statepoint
+ }
+ assert(Token && "Should be set in one of the above branches!");
+
+ if (IsDeoptimize) {
+ // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
+ // transform the tail-call like structure to a call to a void function
+ // followed by unreachable to get better codegen.
+ Replacements.push_back(
+ DeferredReplacement::createDeoptimizeReplacement(CS.getInstruction()));
+ } else {
+ Token->setName("statepoint_token");
+ if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+ StringRef Name =
+ CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
+
+ // We cannot RAUW or delete CS.getInstruction() because it could be in the
+ // live set of some other safepoint, in which case that safepoint's
+ // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+ // llvm::Instruction. Instead, we defer the replacement and deletion to
+ // after the live sets have been made explicit in the IR, and we no longer
+ // have raw pointers to worry about.
+ Replacements.emplace_back(
+ DeferredReplacement::createRAUW(CS.getInstruction(), GCResult));
+ } else {
+ Replacements.emplace_back(
+ DeferredReplacement::createDelete(CS.getInstruction()));
+ }
+ }
+
+ Result.StatepointToken = Token;
+
+ // Second, create a gc.relocate for every live variable
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
+}
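+// For illustration, a heavily simplified sketch of the rewrite performed by
+// makeStatepointExplicitImpl for a call with one live gc pointer %p (names
+// illustrative, intrinsic signatures elided):
+//   %r = call i32 @foo(i32 %a)
+// becomes, roughly,
+//   %tok = call token @llvm.experimental.gc.statepoint(... @foo, i32 %a, ...)
+//   %p.relocated = call ... @llvm.experimental.gc.relocate(token %tok, ...)
+//   %r.new = call i32 @llvm.experimental.gc.result(token %tok)
+// with the RAUW of %r to %r.new and the deletion of the old call deferred
+// (see DeferredReplacement).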
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, CallSite CS,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ const auto &LiveSet = Result.LiveSet;
+ const auto &PointerToBase = Result.PointerToBase;
+
+ // Convert to vector for efficient cross referencing.
+ SmallVector<Value *, 64> BaseVec, LiveVec;
+ LiveVec.reserve(LiveSet.size());
+ BaseVec.reserve(LiveSet.size());
+ for (Value *L : LiveSet) {
+ LiveVec.push_back(L);
+ assert(PointerToBase.count(L));
+ Value *Base = PointerToBase.find(L)->second;
+ BaseVec.push_back(Base);
+ }
+ assert(LiveVec.size() == BaseVec.size());
+
+ // Do the actual rewriting and delete the old statepoint
+ makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper function for relocationViaAlloca.
+//
+// It receives an iterator range over the statepoint's gc relocates and emits
+// a store to the assigned location (via AllocaMap) for each of them. It adds
+// the visited values to the VisitedLiveValues set, which we will later use
+// for sanity checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+
+ for (User *U : GCRelocs) {
+ GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+ if (!Relocate)
+ continue;
+
+ Value *OriginalValue = Relocate->getDerivedPtr();
+ assert(AllocaMap.count(OriginalValue));
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ // Emit a store into the related alloca.
+ // All gc_relocates are i8 addrspace(1)* typed and must be bitcast to the
+ // correct type according to the alloca.
+ assert(Relocate->getNextNode() &&
+ "Should always have one since it's not a terminator");
+ IRBuilder<> Builder(Relocate->getNextNode());
+ Value *CastedRelocatedValue =
+ Builder.CreateBitCast(Relocate,
+ cast<AllocaInst>(Alloca)->getAllocatedType(),
+ suffixed_name_or(Relocate, ".casted", ""));
+
+ StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
+ Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
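+// For illustration (names illustrative): for a gc.relocate %p.relocated whose
+// original value %p was assigned the slot %p.slot, the loop above emits
+// roughly
+//   %p.relocated.casted = bitcast i8 addrspace(1)* %p.relocated to <slot ty>
+//   store <slot ty> %p.relocated.casted, <slot ty>* %p.slot
+// right after the relocate.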
+
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void insertRematerializationStores(
+ const RematerializedValueMapTy &RematerializedValues,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+
+ for (auto RematerializedValuePair: RematerializedValues) {
+ Instruction *RematerializedValue = RematerializedValuePair.first;
+ Value *OriginalValue = RematerializedValuePair.second;
+
+ assert(AllocaMap.count(OriginalValue) &&
+ "Can not find alloca for rematerialized value");
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
+ Store->insertAfter(RematerializedValue);
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+ int InitialAllocaNum = 0;
+ for (Instruction &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum++;
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, Value *> AllocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ // Used later to check that we have enough allocas to store all values
+ std::size_t NumRematerializedValues = 0;
+ PromotableAllocas.reserve(Live.size());
+
+ // Emit alloca for "LiveValue" and record it in "allocaMap" and
+ // "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto emitAllocaFor = [&](Value *LiveValue) {
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[LiveValue] = Alloca;
+ PromotableAllocas.push_back(Alloca);
+ };
+
+ // Emit alloca for each live gc pointer
+ for (Value *V : Live)
+ emitAllocaFor(V);
+
+ // Emit allocas for rematerialized values
+ for (const auto &Info : Records)
+ for (auto RematerializedValuePair : Info.RematerializedValues) {
+ Value *OriginalValue = RematerializedValuePair.second;
+ if (AllocaMap.count(OriginalValue) != 0)
+ continue;
+
+ emitAllocaFor(OriginalValue);
+ ++NumRematerializedValues;
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
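+ // For illustration (a simplified sketch; names illustrative), for a live gc
+ // pointer %p this function produces roughly
+ //   %p.slot = alloca <ty>            ; in the entry block
+ //   store %p, %p.slot                ; after the original def
+ //   store %p.relocated, %p.slot      ; after each statepoint (first loop)
+ //   %p.reload = load %p.slot         ; before each use (second loop)
+ // and PromoteMemToReg then folds the slots back into SSA form.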
+
+ // Update gc pointer after each statepoint: either store a relocated value or
+ // null (if no relocated value was found for this gc pointer and it is not a
+ // gc_result). This must happen before we update the statepoint with a load
+ // of the alloca; otherwise we lose the link between the statepoint and the
+ // old def.
+ for (const auto &Info : Records) {
+ Value *Statepoint = Info.StatepointToken;
+
+ // This will be used for a consistency check
+ DenseSet<Value *> VisitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+ // If this was an invoke statepoint,
+ // we also insert stores for the exceptional-path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+ VisitedLiveValues);
+ }
+
+ // Do the same for rematerialized values
+ insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+ VisitedLiveValues);
+
+ if (ClobberNonLive) {
+ // As a debugging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into SEGVs that
+ // are slightly easier to debug. Note that on large IR files with lots of
+ // gc.statepoints this is extremely costly in both memory and time.
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+ // This value was relocated
+ if (VisitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto PT = cast<PointerType>(AI->getAllocatedType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ StoreInst *Store = new StoreInst(CPN, AI);
+ Store->insertBefore(IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+ }
+ }
+ }
+
+ // Update the uses with loads from the allocas and add a store for each
+ // original def.
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ Value *Alloca = Pair.second;
+
+ // We pre-record the uses of allocas so that we don't have to worry about
+ // later updates that change the user information.
+
+ SmallVector<Instruction *, 20> Uses;
+ // PERF: trade a linear scan for repeated reallocation
+ Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ for (User *U : Def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ Uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ std::sort(Uses.begin(), Uses.end());
+ auto Last = std::unique(Uses.begin(), Uses.end());
+ Uses.erase(Last, Uses.end());
+
+ for (Instruction *Use : Uses) {
+ if (isa<PHINode>(Use)) {
+ PHINode *Phi = cast<PHINode>(Use);
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+ if (Def == Phi->getIncomingValue(i)) {
+ LoadInst *Load = new LoadInst(
+ Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ }
+ } else {
+ LoadInst *Load = new LoadInst(Alloca, "", Use);
+ Use->replaceUsesOfWith(Def, Load);
+ }
+ }
+
+ // Emit a store for the initial gc value. The store must be inserted after
+ // the loads; otherwise the store would be in the alloca's use list and an
+ // extra load would be inserted before it.
+ StoreInst *Store = new StoreInst(Def, Alloca);
+ if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+ // InvokeInst is a TerminatorInst, so the store needs to be inserted
+ // into its normal destination block.
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ Store->insertBefore(NormalDest->getFirstNonPHI());
+ } else {
+ assert(!Inst->isTerminator() &&
+ "The only TerminatorInst that can produce a value is "
+ "InvokeInst which is handled above.");
+ Store->insertAfter(Inst);
+ }
+ } else {
+ assert(isa<Argument>(Def));
+ Store->insertAfter(cast<Instruction>(Alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // Apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (auto &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum--;
+ assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require that we sort the input
+/// vector. Sorting would have the effect of changing the output of a couple
+/// of tests in ways which make them less useful for testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ SmallSet<T, 8> Seen;
+ Vec.erase(remove_if(Vec, [&](const T &V) { return !Seen.insert(V).second; }),
+ Vec.end());
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &Holders) {
+ if (Values.empty())
+ // No values to hold live, might as well not insert the empty holder
+ return;
+
+ Module *M = CS.getInstruction()->getModule();
+ // Use a dummy vararg function to actually hold the values live
+ Function *Func = cast<Function>(M->getOrInsertFunction(
+ "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));
+ if (CS.isCall()) {
+ // For call safepoints insert dummy calls right after safepoint
+ Holders.push_back(CallInst::Create(Func, Values, "",
+ &*++CS.getInstruction()->getIterator()));
+ return;
+ }
+ // For invoke safepoints insert dummy calls both in normal and
+ // exceptional destination blocks
+ auto *II = cast<InvokeInst>(CS.getInstruction());
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
+}
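+// For illustration (values illustrative): for deopt operands %a and %b this
+// inserts a holder such as
+//   call void (...) @__tmp_use(i8 addrspace(1)* %a, i8 addrspace(1)* %b)
+// right after a call safepoint, or at the first insertion point of both the
+// normal and unwind destinations of an invoke; the holders are erased again
+// once liveness has been recomputed.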
+
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ GCPtrLivenessData OriginalLivenessData;
+ computeLiveInValues(DT, F, OriginalLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
+ }
+}
+
+// Helper function for "rematerializeLiveValues". It walks the use chain
+// starting from "CurrentValue" until it reaches the root of the chain, i.e.
+// the base or a value it cannot process. Only "simple" values are processed
+// (currently GEPs and casts). The returned root is examined by the
+// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
+// array with all visited values.
+static Value* findRematerializableChainToBasePointer(
+ SmallVectorImpl<Instruction*> &ChainToBase,
+ Value *CurrentValue) {
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+ ChainToBase.push_back(GEP);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ GEP->getPointerOperand());
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+ if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+ return CI;
+
+ ChainToBase.push_back(CI);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ CI->getOperand(0));
+ }
+
+ // We have reached the root of the chain, which is either equal to the base or
+ // is the first unsupported value along the use chain.
+ return CurrentValue;
+}
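+// For illustration (names illustrative): starting from %d in
+//   %g = getelementptr i32, i32 addrspace(1)* %base, i64 16
+//   %d = bitcast i32 addrspace(1)* %g to i8 addrspace(1)*
+// the walk fills ChainToBase with [%d, %g] and returns %base as the root of
+// the chain.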
+
+// Helper function for "rematerializeLiveValues". Computes the cost of the
+// use chain we are going to rematerialize.
+static unsigned
+chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+ TargetTransformInfo &TTI) {
+ unsigned Cost = 0;
+
+ for (Instruction *Instr : Chain) {
+ if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+ assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+ "non noop cast is found during rematerialization");
+
+ Type *SrcTy = CI->getOperand(0)->getType();
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
+
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+ // Cost of the address calculation
+ Type *ValTy = GEP->getSourceElementType();
+ Cost += TTI.getAddressComputationCost(ValTy);
+
+ // And cost of the GEP itself
+ // TODO: Use TTI->getGEPCost here (it exists, but does not appear to be
+ // allowed for external use)
+ if (!GEP->hasAllConstantIndices())
+ Cost += 2;
+
+ } else {
+ llvm_unreachable("unsupported instruction type during rematerialization");
+ }
+ }
+
+ return Cost;
+}
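+// For illustration: the cost of a chain is the sum of the TTI cast costs of
+// its no-op casts plus, for each GEP, the address-computation cost and an
+// extra 2 when the GEP has a non-constant index; the concrete numbers are
+// target-dependent.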
+
+static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
+
+ unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
+ if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
+ OrigRootPhi.getParent() != AlternateRootPhi.getParent())
+ return false;
+ // Map of incoming values and their corresponding basic blocks of
+ // OrigRootPhi.
+ SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
+ for (unsigned i = 0; i < PhiNum; i++)
+ CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
+ OrigRootPhi.getIncomingBlock(i);
+
+ // Both current and base PHIs should have same incoming values and
+ // the same basic blocks corresponding to the incoming values.
+ for (unsigned i = 0; i < PhiNum; i++) {
+ auto CIVI =
+ CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
+ if (CIVI == CurrentIncomingValues.end())
+ return false;
+ BasicBlock *CurrentIncomingBB = CIVI->second;
+ if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
+ return false;
+ }
+ return true;
+
+}
+
+// From the statepoint live set, pick values that are cheaper to recompute
+// than to relocate. Remove these values from the live set, rematerialize
+// them after the statepoint, and record them in the "Info" structure. Note
+// that, as with relocated values, we don't do any user adjustments here.
+static void rematerializeLiveValues(CallSite CS,
+ PartiallyConstructedSafepointRecord &Info,
+ TargetTransformInfo &TTI) {
+ const unsigned int ChainLengthThreshold = 10;
+
+ // Record values we are going to delete from this statepoint live set.
+ // We cannot do this in the following loop due to iterator invalidation.
+ SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+ for (Value *LiveValue: Info.LiveSet) {
+ // For each live pointer find its defining chain
+ SmallVector<Instruction *, 3> ChainToBase;
+ assert(Info.PointerToBase.count(LiveValue));
+ Value *RootOfChain =
+ findRematerializableChainToBasePointer(ChainToBase,
+ LiveValue);
+
+ // Nothing to do, or the chain is too long
+ if (ChainToBase.size() == 0 ||
+ ChainToBase.size() > ChainLengthThreshold)
+ continue;
+
+ // Handle the scenario where the RootOfChain is not equal to the
+ // Base Value, but they are essentially the same phi values.
+ if (RootOfChain != Info.PointerToBase[LiveValue]) {
+ PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
+ PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
+ if (!OrigRootPhi || !AlternateRootPhi)
+ continue;
+ // PHI nodes that have the same incoming values and belong to the same
+ // basic block are essentially the same SSA value. When the original phi
+ // has incoming values with different base pointers, the original phi is
+ // marked as a conflict, and an additional `AlternateRootPhi` with the same
+ // incoming values gets generated by the findBasePointer function. We need
+ // to verify that the newly generated AlternateRootPhi (the .base version of
+ // the phi) and RootOfChain (the original phi node itself) are the same, so
+ // that we can rematerialize the gep and casts. This is a workaround for a
+ // deficiency in the findBasePointer algorithm.
+ if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
+ continue;
+ // Now that the phi nodes are proved to be the same, assert that
+ // findBasePointer's newly generated AlternateRootPhi is present in the
+ // liveset of the call.
+ assert(Info.LiveSet.count(AlternateRootPhi));
+ }
+ // Compute cost of this chain
+ unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+ // TODO: We can also account for cases when we will be able to remove some
+ // of the rematerialized values by later optimization passes, e.g. if we
+ // rematerialized several intersecting chains, or if the original values
+ // don't have any uses besides this statepoint.
+
+ // For invokes we need to rematerialize each chain twice - for normal and
+ // for unwind basic blocks. Model this by multiplying cost by two.
+ if (CS.isInvoke()) {
+ Cost *= 2;
+ }
+ // If it's too expensive - skip it
+ if (Cost >= RematerializationThreshold)
+ continue;
+
+ // Remove value from the live set
+ LiveValuesToBeDeleted.push_back(LiveValue);
+
+ // Clone instructions and record them inside "Info" structure
+
+ // Walk backwards to visit top-most instructions first
+ std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+ // Utility function which clones all instructions from "ChainToBase"
+ // and inserts them before "InsertBefore". Returns rematerialized value
+ // which should be used after statepoint.
+ auto rematerializeChain = [&ChainToBase](
+ Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
+ Instruction *LastClonedValue = nullptr;
+ Instruction *LastValue = nullptr;
+ for (Instruction *Instr: ChainToBase) {
+ // Only GEP's and casts are supported as we need to be careful to not
+ // introduce any new uses of pointers not in the liveset.
+ // Note that it's fine to introduce new uses of pointers which were
+ // otherwise not used after this statepoint.
+ assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+ Instruction *ClonedValue = Instr->clone();
+ ClonedValue->insertBefore(InsertBefore);
+ ClonedValue->setName(Instr->getName() + ".remat");
+
+ // If it is not the first instruction in the chain, then it uses the
+ // previously cloned value. We should update it to use that clone.
+ if (LastClonedValue) {
+ assert(LastValue);
+ ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+ for (auto OpValue : ClonedValue->operand_values()) {
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ assert(!is_contained(ChainToBase, OpValue) &&
+ "incorrect use in rematerialization chain");
+ // Assert that the cloned instruction does not use the RootOfChain
+ // or the AlternateLiveBase.
+ assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
+ }
+#endif
+ } else {
+ // For the first instruction, replace the use of unrelocated base i.e.
+ // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
+ // live set. They have been proved to be the same PHI nodes. Note
+ // that the *only* use of the RootOfChain in the ChainToBase list is
+ // the first Value in the list.
+ if (RootOfChain != AlternateLiveBase)
+ ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
+ }
+
+ LastClonedValue = ClonedValue;
+ LastValue = Instr;
+ }
+ assert(LastClonedValue);
+ return LastClonedValue;
+ };
+
+ // Different cases for calls and invokes. For invokes we need to clone
+ // instructions both on normal and unwind path.
+ if (CS.isCall()) {
+ Instruction *InsertBefore = CS.getInstruction()->getNextNode();
+ assert(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(
+ InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Info.RematerializedValues[RematerializedValue] = LiveValue;
+ } else {
+ InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
+
+ Instruction *NormalInsertBefore =
+ &*Invoke->getNormalDest()->getFirstInsertionPt();
+ Instruction *UnwindInsertBefore =
+ &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+ Instruction *NormalRematerializedValue = rematerializeChain(
+ NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Instruction *UnwindRematerializedValue = rematerializeChain(
+ UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+
+ Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+ Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+ }
+ }
+
+ // Remove rematerialized values from the live set
+ for (auto LiveValue: LiveValuesToBeDeleted) {
+ Info.LiveSet.remove(LiveValue);
+ }
+}
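+// For illustration (names illustrative): if the live set contains
+//   %g = getelementptr i32, i32 addrspace(1)* %base, i64 4
+// and the chain is cheap enough, %g is removed from the live set and a clone
+//   %g.remat = getelementptr i32, i32 addrspace(1)* %base, i64 4
+// is inserted right after the statepoint (once on each of the normal and
+// unwind paths for an invoke); relocationViaAlloca later stores %g.remat into
+// %g's slot and rewrites the clone's use of %base to a load of %base's slot.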
+
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ SmallVectorImpl<CallSite> &ToUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallSite> Uniqued;
+ Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+ assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+ for (CallSite CS : ToUpdate)
+ assert(CS.getInstruction()->getFunction() == &F);
+#endif
+
+ // When inserting gc.relocates for invokes, we need to be able to insert at
+ // the top of the successor blocks. See the comment on
+ // normalizeForInvokeSafepoint for exactly what is needed. Note that this
+ // step may restructure the CFG.
+ for (CallSite CS : ToUpdate) {
+ if (!CS.isInvoke())
+ continue;
+ auto *II = cast<InvokeInst>(CS.getInstruction());
+ normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+ normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+ }
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> Holders;
+
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (CallSite CS : ToUpdate) {
+ SmallVector<Value *, 64> DeoptValues;
+
+ for (Value *Arg : GetDeoptBundleOperands(CS)) {
+ assert(!isUnhandledGCPointerType(Arg->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+
+ insertUseHolderAfter(CS, DeoptValues, Holders);
+ }
+
+ SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, ToUpdate, Records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &info = Records[i];
+ findBasePointers(DT, DVCache, ToUpdate[i], info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ Holders.reserve(Holders.size() + Records.size());
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : Info.PointerToBase)
+ Bases.push_back(Pair.second);
+
+ insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+ }
+
+ // By selecting base pointers, we've effectively inserted new uses. Thus, we
+ // need to rerun liveness. We may *also* have inserted new defs, but that's
+ // not the key issue.
+ recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+ if (PrintBasePointers) {
+ for (auto &Info : Records) {
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : Info.PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+ }
+
+ // It is possible that non-constant live variables have a constant base. For
+ // example, a GEP with a variable offset from a global. In this case we can
+ // remove it from the liveset. We already don't add constants to the liveset
+ // because we assume they won't move at runtime and the GC doesn't need to be
+ // informed about them. The same reasoning applies if the base is constant.
+ // Note that the relocation placement code relies on this filtering for
+ // correctness as it expects the base to be in the liveset, which isn't true
+ // if the base is constant.
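+ // For illustration (names illustrative): for
+ //   %p = getelementptr i32, i32 addrspace(1)* @some_global, i64 %i
+ // the base of %p is the constant @some_global, so %p is dropped from the
+ // live set here even though %p itself is not constant.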
+ for (auto &Info : Records)
+ for (auto &BasePair : Info.PointerToBase)
+ if (isa<Constant>(BasePair.second))
+ Info.LiveSet.remove(BasePair.first);
+
+ for (CallInst *CI : Holders)
+ CI->eraseFromParent();
+
+ Holders.clear();
+
+ // In order to reduce the live set of a statepoint we might choose to
+ // rematerialize some values instead of relocating them. This is purely an
+ // optimization and does not influence correctness.
+ for (size_t i = 0; i < Records.size(); i++)
+ rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+ // We need this to safely RAUW and delete call or invoke return values that
+ // may themselves be live over a statepoint. For details, please see usage in
+ // makeStatepointExplicitImpl.
+ std::vector<DeferredReplacement> Replacements;
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint can not be a live variable, thus we can remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < Records.size(); i++)
+ makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+ ToUpdate.clear(); // prevent accidental use of invalid CallSites
+
+ for (auto &PR : Replacements)
+ PR.doReplacement();
+
+ Replacements.clear();
+
+ for (auto &Info : Records) {
+ // These live sets may contain stale Value pointers, since we replaced calls
+ // with operand bundles with calls wrapped in gc.statepoint, and some of
+ // those calls may have been def'ing live gc pointers. Clear these out to
+ // avoid accidentally using them.
+ //
+ // TODO: We should create a separate data structure that does not contain
+ // these live sets, and migrate to using that data structure from this point
+ // onward.
+ Info.LiveSet.clear();
+ Info.PointerToBase.clear();
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> Live;
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the live set is embedded in the statepoint (and updated), so
+ // we just grab that.
+ Statepoint Statepoint(Info.StatepointToken);
+ Live.insert(Live.end(), Statepoint.gc_args_begin(),
+ Statepoint.gc_args_end());
+#ifndef NDEBUG
+ // Do some basic sanity checks on our liveness results before performing
+ // relocation. Relocation can and will turn mistakes in liveness results
+ // into non-sensical code which is much harder to debug.
+ // TODO: It would be nice to test consistency as well
+ assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+ "statepoint must be reachable or liveness is meaningless");
+ for (Value *V : Statepoint.gc_args()) {
+ if (!isa<Instruction>(V))
+ // Non-instruction values trivially dominate all possible uses
+ continue;
+ auto *LiveInst = cast<Instruction>(V);
+ assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+ "unreachable values should never be live");
+ assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+#endif
+ }
+ unique_unsorted(Live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto *Ptr : Live)
+ assert(isHandledGCPointerType(Ptr->getType()) &&
+ "must be a gc pointer type");
+#endif
+
+ relocationViaAlloca(F, DT, Live, Records);
+ return !Records.empty();
+}
+
+// Handles both return values and arguments for Functions and CallSites.
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+ unsigned Index) {
+ AttrBuilder R;
+ if (AH.getDereferenceableBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
+ AH.getDereferenceableBytes(Index)));
+ if (AH.getDereferenceableOrNullBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+ AH.getDereferenceableOrNullBytes(Index)));
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
+ R.addAttribute(Attribute::NoAlias);
+
+ if (!R.empty())
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
+}
+
+void
+RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
+ LLVMContext &Ctx = F.getContext();
+
+ for (Argument &A : F.args())
+ if (isa<PointerType>(A.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
+
+ if (isa<PointerType>(F.getReturnType()))
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) {
+
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores
+ // after RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: the reasoning is
+ // the same as for stripping the analogous dereferenceability and noalias
+ // attributes.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+
+}
+
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+ if (F.empty())
+ return;
+
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder Builder(Ctx);
+
+
+ for (Instruction &I : instructions(F)) {
+ if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
+ assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
+ bool IsImmutableTBAA =
+ MD->getNumOperands() == 4 &&
+ mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
+
+ if (!IsImmutableTBAA)
+ continue; // no work to do, MD_tbaa is already marked mutable
+
+ MDNode *Base = cast<MDNode>(MD->getOperand(0));
+ MDNode *Access = cast<MDNode>(MD->getOperand(1));
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+
+ MDNode *MutableTBAA =
+ Builder.createTBAAStructTagNode(Base, Access, Offset);
+ I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+ }
+
+ stripInvalidMetadataFromInstruction(I);
+
+ if (CallSite CS = CallSite(&I)) {
+ for (int i = 0, e = CS.arg_size(); i != e; i++)
+ if (isa<PointerType>(CS.getArgument(i)->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, CS, i + AttributeList::FirstArgIndex);
+ if (isa<PointerType>(CS.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
+ }
+ }
+}
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
+#ifndef NDEBUG
+ assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
+#endif
+
+ for (Function &F : M)
+ stripNonValidAttributesFromPrototype(F);
+
+ for (Function &F : M)
+ stripNonValidAttributesAndMetadataFromBody(F);
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ return false;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ auto NeedsRewrite = [](Instruction &I) {
+ if (ImmutableCallSite CS = ImmutableCallSite(&I))
+ return !callsGCLeafFunction(CS) && !isStatepoint(CS);
+ return false;
+ };
+
+ // Gather all the statepoints which need to be rewritten. Be careful to only
+ // consider those in reachable code since we need to ask dominance queries
+ // when rewriting. We'll delete the unreachable ones in a moment.
+ SmallVector<CallSite, 64> ParsePointNeeded;
+ bool HasUnreachableStatepoint = false;
+ for (Instruction &I : instructions(F)) {
+ // TODO: only the ones with the flag set!
+ if (NeedsRewrite(I)) {
+ if (DT.isReachableFromEntry(I.getParent()))
+ ParsePointNeeded.push_back(CallSite(&I));
+ else
+ HasUnreachableStatepoint = true;
+ }
+ }
+
+ bool MadeChange = false;
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers. Rather than be fancy, we
+ // just reuse a utility function which removes the unreachable blocks.
+ if (HasUnreachableStatepoint)
+ MadeChange |= removeUnreachableBlocks(F);
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return MadeChange;
+
+ // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+ // These are created by LCSSA. They have the effect of increasing the size
+ // of liveness sets for no good reason. It may be harder to do this post
+ // insertion since relocations and base phis can confuse things.
+ for (BasicBlock &BB : F)
+ if (BB.getUniquePredecessor()) {
+ MadeChange = true;
+ FoldSingleEntryPHINodes(&BB);
+ }
+
+ // Before we start introducing relocations, we want to tweak the IR a bit to
+ // avoid unfortunate code generation effects. The main example is that we
+ // want to try to make sure the comparison feeding a branch is after any
+ // safepoints. Otherwise, we end up with a comparison of pre-relocation
+ // values feeding a branch after relocation. This is semantically correct,
+ // but results in extra register pressure since both the pre-relocation and
+ // post-relocation copies must be available in registers. For code without
+ // relocations this is handled elsewhere, but teaching the scheduler to
+ // reverse the transform we're about to do would be slightly complex.
+ // Note: This may extend the live range of the inputs to the icmp and thus
+ // increase the liveset of any statepoint we move over. This is profitable
+ // as long as all statepoints are in rare blocks. If we had in-register
+ // lowering for live values this would be a much safer transform.
+ auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+ if (auto *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional())
+ return dyn_cast<Instruction>(BI->getCondition());
+ // TODO: Extend this to handle switches
+ return nullptr;
+ };
+ for (BasicBlock &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (auto *Cond = getConditionInst(TI))
+ // TODO: Handle more than just ICmps here. We should be able to move
+ // most instructions without side effects or memory access.
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+ MadeChange = true;
+ Cond->moveBefore(TI);
+ }
+ }
+
+ MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+ return MadeChange;
+}
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
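+// A sketch of the backward dataflow equations implemented below, using the
+// set names from GCPtrLivenessData:
+//   LiveOut(BB) = union of LiveIn(S) over all successors S of BB
+//   LiveIn(BB)  = (LiveSet(BB) union LiveOut(BB)) - KillSet(BB)
+// where LiveSet(BB) holds the gc pointers used in BB before being defined
+// there and KillSet(BB) holds the gc pointers defined in BB.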
+
+/// Compute the live-in set for the location Begin, starting from
+/// the live-out set of the basic block
+static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
+ BasicBlock::reverse_iterator End,
+ SetVector<Value *> &LiveTmp) {
+ for (auto &I : make_range(Begin, End)) {
+ // KILL/Def - Remove this definition from LiveIn
+ LiveTmp.remove(&I);
+
+ // Don't consider *uses* in PHI nodes, we handle their contribution to
+ // predecessor blocks when we seed the LiveOut sets
+ if (isa<PHINode>(I))
+ continue;
+
+ // USE - Add to the LiveIn set for this instruction
+ for (Value *V : I.operands()) {
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+ // The choice to exclude all things constant here is slightly subtle.
+ // There are two independent reasons:
+ // - We assume that things which are constant (from LLVM's definition)
+ // do not move at runtime. For example, the address of a global
+ // variable is fixed, even though its contents may not be.
+ // - Second, we can't disallow arbitrary inttoptr constants even
+ // if the language frontend does. Optimization passes are free to
+ // locally exploit facts without respect to global reachability. This
+ // can create sections of code which are dynamically unreachable and
+ // contain just about anything. (see constants.ll in tests)
+ LiveTmp.insert(V);
+ }
+ }
+ }
+}
+
+static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
+ for (BasicBlock *Succ : successors(BB)) {
+ for (auto &I : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(BB);
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
+ LiveTmp.insert(V);
+ }
+ }
+}
+
+static SetVector<Value *> computeKillSet(BasicBlock *BB) {
+ SetVector<Value *> KillSet;
+ for (Instruction &I : *BB)
+ if (isHandledGCPointerType(I.getType()))
+ KillSet.insert(&I);
+ return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
+ TerminatorInst *TI, bool TermOkay = false) {
+ for (Value *V : Live) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // The terminator can be a member of the LiveOut set. LLVM's definition
+ // of instruction dominance states that V does not dominate itself. As
+ // such, we need to special case this to allow it.
+ if (TermOkay && TI == I)
+ continue;
+ assert(DT.dominates(I, TI) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+ }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties. This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+ BasicBlock &BB) {
+ checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+ checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+ checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data) {
+ SmallSetVector<BasicBlock *, 32> Worklist;
+
+ // Seed the liveness for each individual block
+ for (BasicBlock &BB : F) {
+ Data.KillSet[&BB] = computeKillSet(&BB);
+ Data.LiveSet[&BB].clear();
+ computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+ for (Value *Kill : Data.KillSet[&BB])
+ assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+ Data.LiveOut[&BB] = SetVector<Value *>();
+ computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+ Data.LiveIn[&BB] = Data.LiveSet[&BB];
+ Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
+ Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
+ if (!Data.LiveIn[&BB].empty())
+ Worklist.insert(pred_begin(&BB), pred_end(&BB));
+ }
+
+ // Propagate that liveness until stable
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // Compute our new liveout set, then exit early if it hasn't changed despite
+ // the contribution of our successors.
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+ const auto OldLiveOutSize = LiveOut.size();
+ for (BasicBlock *Succ : successors(BB)) {
+ assert(Data.LiveIn.count(Succ));
+ LiveOut.set_union(Data.LiveIn[Succ]);
+ }
+ // assert OldLiveOut is a subset of LiveOut
+ if (OldLiveOutSize == LiveOut.size()) {
+ // If the sets are the same size, then we didn't actually add anything
+ // when unioning our successors' LiveIn. Thus, the LiveIn of this block
+ // hasn't changed.
+ continue;
+ }
+ Data.LiveOut[BB] = LiveOut;
+
+ // Apply the effects of this basic block
+ SetVector<Value *> LiveTmp = LiveOut;
+ LiveTmp.set_union(Data.LiveSet[BB]);
+ LiveTmp.set_subtract(Data.KillSet[BB]);
+
+ assert(Data.LiveIn.count(BB));
+ const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
+ // assert: OldLiveIn is a subset of LiveTmp
+ if (OldLiveIn.size() != LiveTmp.size()) {
+ Data.LiveIn[BB] = LiveTmp;
+ Worklist.insert(pred_begin(BB), pred_end(BB));
+ }
+ } // while (!Worklist.empty())
+
+#ifndef NDEBUG
+ // Sanity check our output against SSA properties. This helps catch any
+ // missing kills during the above iteration.
+ for (BasicBlock &BB : F)
+ checkBasicSSA(DT, Data, BB);
+#endif
+}
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &Out) {
+
+ BasicBlock *BB = Inst->getParent();
+
+ // Note: The copy is intentional and required
+ assert(Data.LiveOut.count(BB));
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+
+ // We want to handle the statepoint itself oddly. Its
+ // call result is not live (normal), nor are its arguments
+ // (unless they're used again later). This adjustment is
+ // specifically what we need to relocate
+ computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
+ LiveOut);
+ LiveOut.remove(Inst);
+ Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallSite CS,
+ PartiallyConstructedSafepointRecord &Info) {
+ Instruction *Inst = CS.getInstruction();
+ StatepointLiveSetTy Updated;
+ findLiveSetAtInst(Inst, RevisedLivenessData, Updated);
+
+#ifndef NDEBUG
+ DenseSet<Value *> Bases;
+ for (auto KVPair : Info.PointerToBase)
+ Bases.insert(KVPair.second);
+#endif
+
+ // We may have base pointers which are now live that weren't before. We need
+ // to update the PointerToBase structure to reflect this.
+ for (auto V : Updated)
+ if (Info.PointerToBase.insert({V, V}).second) {
+ assert(Bases.count(V) && "Can't find base for unexpected live value!");
+ continue;
+ }
+
+#ifndef NDEBUG
+ for (auto V : Updated)
+ assert(Info.PointerToBase.count(V) &&
+ "Must be able to find base for live value!");
+#endif
+
+ // Remove any stale base mappings - this can happen since our liveness is
+ // more precise than the one inherent in the base pointer analysis.
+ DenseSet<Value *> ToErase;
+ for (auto KVPair : Info.PointerToBase)
+ if (!Updated.count(KVPair.first))
+ ToErase.insert(KVPair.first);
+
+ for (auto *V : ToErase)
+ Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+ for (auto KVPair : Info.PointerToBase)
+ assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+ Info.LiveSet = Updated;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 000000000000..a738ebb4607e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1983 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy. It is a simple class with value semantics.
+///
+class LatticeVal {
+ enum LatticeValueTy {
+ /// unknown - This LLVM Value has no known value yet.
+ unknown,
+
+ /// constant - This LLVM Value has a specific constant value.
+ constant,
+
+ /// forcedconstant - This LLVM Value was thought to be undef until
+ /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
+ /// with another (different) constant, it goes to overdefined, instead of
+ /// asserting.
+ forcedconstant,
+
+ /// overdefined - This instruction is not known to be constant, and we know
+ /// it has a value.
+ overdefined
+ };
+
+ /// Val: This stores the current lattice value along with the Constant* for
+ /// the constant if this is a 'constant' or 'forcedconstant' value.
+ PointerIntPair<Constant *, 2, LatticeValueTy> Val;
+
+ LatticeValueTy getLatticeValue() const {
+ return Val.getInt();
+ }
+
+public:
+ LatticeVal() : Val(nullptr, unknown) {}
+
+ bool isUnknown() const { return getLatticeValue() == unknown; }
+ bool isConstant() const {
+ return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
+ }
+ bool isOverdefined() const { return getLatticeValue() == overdefined; }
+
+ Constant *getConstant() const {
+ assert(isConstant() && "Cannot get the constant of a non-constant!");
+ return Val.getPointer();
+ }
+
+ /// markOverdefined - Return true if this is a change in status.
+ bool markOverdefined() {
+ if (isOverdefined())
+ return false;
+
+ Val.setInt(overdefined);
+ return true;
+ }
+
+ /// markConstant - Return true if this is a change in status.
+ bool markConstant(Constant *V) {
+ if (getLatticeValue() == constant) { // Constant but not forcedconstant.
+ assert(getConstant() == V && "Marking constant with different value");
+ return false;
+ }
+
+ if (isUnknown()) {
+ Val.setInt(constant);
+ assert(V && "Marking constant with NULL");
+ Val.setPointer(V);
+ } else {
+ assert(getLatticeValue() == forcedconstant &&
+ "Cannot move from overdefined to constant!");
+ // Stay at forcedconstant if the constant is the same.
+ if (V == getConstant()) return false;
+
+ // Otherwise, we go to overdefined. Assumptions made based on the
+ // forced value are possibly wrong. Assuming this is another constant
+ // could expose a contradiction.
+ Val.setInt(overdefined);
+ }
+ return true;
+ }
+
+ /// getConstantInt - If this is a constant with a ConstantInt value, return
+ /// it, otherwise return null.
+ ConstantInt *getConstantInt() const {
+ if (isConstant())
+ return dyn_cast<ConstantInt>(getConstant());
+ return nullptr;
+ }
+
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it, otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
+ void markForcedConstant(Constant *V) {
+ assert(isUnknown() && "Can't force a defined value!");
+ Val.setInt(forcedconstant);
+ Val.setPointer(V);
+ }
+};
+} // end anonymous namespace.
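+// An illustrative sketch of the LatticeVal transitions (not exhaustive):
+//   unknown --markConstant(C)--------> constant(C)
+//   unknown --markForcedConstant(C)--> forcedconstant(C)
+//   forcedconstant(C) --markConstant(D), D != C--> overdefined
+//   any state --markOverdefined()----> overdefined
+// markConstant on an already-'constant' value with a different constant
+// asserts.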
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ const DataLayout &DL;
+ const TargetLibraryInfo *TLI;
+ SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
+
+ /// StructValueState - This maintains ValueState for values that have
+ /// StructType, for example for formal arguments, calls, insertelement, etc.
+ ///
+ DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;
+
+  /// TrackedGlobals - If we are tracking any values for the contents of a
+  /// global variable, we keep a mapping from the global to the currently known
+  /// value. If the value becomes overdefined, its entry is simply removed from
+  /// this map.
+ DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+ /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+ /// represented here for efficient lookup.
+ SmallPtrSet<Function*, 16> MRVFunctionsTracked;
+
+  /// TrackingIncomingArguments - This is the set of functions whose arguments
+  /// we make optimistic assumptions about and try to prove to be constants.
+ SmallPtrSet<Function*, 16> TrackingIncomingArguments;
+
+ /// The reason for two worklists is that overdefined is the lowest state
+ /// on the lattice, and moving things to overdefined as fast as possible
+ /// makes SCCP converge much faster.
+ ///
+ /// By having a separate worklist, we accomplish this because everything
+ /// possibly overdefined will become overdefined at the soonest possible
+ /// point.
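+  ///
+  /// (Illustrative: once a value falls to overdefined, visiting its users from
+  /// this list first avoids repeatedly re-propagating constant guesses through
+  /// a chain of users that would end up overdefined anyway.)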
+ SmallVector<Value*, 64> OverdefinedInstWorkList;
+ SmallVector<Value*, 64> InstWorkList;
+
+
+ SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+ DenseSet<Edge> KnownFeasibleEdges;
+public:
+ SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+ : DL(DL), TLI(tli) {}
+
+ /// MarkBlockExecutable - This method can be used by clients to mark all of
+ /// the blocks that are known to be intrinsically live in the processed unit.
+ ///
+ /// This returns true if the block was not considered live before.
+ bool MarkBlockExecutable(BasicBlock *BB) {
+ if (!BBExecutable.insert(BB).second)
+ return false;
+ DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ BBWorkList.push_back(BB); // Add the block to the work list!
+ return true;
+ }
+
+ /// TrackValueOfGlobalVariable - Clients can use this method to
+ /// inform the SCCPSolver that it should track loads and stores to the
+ /// specified global variable if it can. This is only legal to call if
+ /// performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+ // We only track the contents of scalar globals.
+ if (GV->getValueType()->isSingleValueType()) {
+ LatticeVal &IV = TrackedGlobals[GV];
+ if (!isa<UndefValue>(GV->getInitializer()))
+ IV.markConstant(GV->getInitializer());
+ }
+ }
+
+ /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+ /// and out of the specified function (which cannot have its address taken),
+ /// this method must be called.
+ void AddTrackedFunction(Function *F) {
+ // Add an entry, F -> undef.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ MRVFunctionsTracked.insert(F);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+ LatticeVal()));
+ } else
+ TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+ }
+
+ void AddArgumentTrackedFunction(Function *F) {
+ TrackingIncomingArguments.insert(F);
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ ///
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+  /// method should be used to handle this. If this returns true, the solver
+ /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ bool isBlockExecutable(BasicBlock *BB) const {
+ return BBExecutable.count(BB);
+ }
+
+ std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
+ std::vector<LatticeVal> StructValues;
+ auto *STy = dyn_cast<StructType>(V->getType());
+ assert(STy && "getStructLatticeValueFor() can be called only on structs");
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ auto I = StructValueState.find(std::make_pair(V, i));
+ assert(I != StructValueState.end() && "Value not in valuemap!");
+ StructValues.push_back(I->second);
+ }
+ return StructValues;
+ }
+
+ LatticeVal getLatticeValueFor(Value *V) const {
+ DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+ assert(I != ValueState.end() && "V is not in valuemap!");
+ return I->second;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map.
+ ///
+ const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+ return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables.
+ const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+ return TrackedGlobals;
+ }
+
+ /// getMRVFunctionsTracked - Get the set of functions which return multiple
+ /// values tracked by the pass.
+ const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
+ return MRVFunctionsTracked;
+ }
+
+ /// markOverdefined - Mark the specified value overdefined. This
+ /// works with both scalars and structs.
+ void markOverdefined(Value *V) {
+ if (auto *STy = dyn_cast<StructType>(V->getType()))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ markOverdefined(getStructValueState(V, i), V);
+ else
+ markOverdefined(ValueState[V], V);
+ }
+
+ // isStructLatticeConstant - Return true if all the lattice values
+ // corresponding to elements of the structure are not overdefined,
+ // false otherwise.
+ bool isStructLatticeConstant(Function *F, StructType *STy) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ assert(It != TrackedMultipleRetVals.end());
+ LatticeVal LV = It->second;
+ if (LV.isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+private:
+ // pushToWorkList - Helper for markConstant/markForcedConstant/markOverdefined
+ void pushToWorkList(LatticeVal &IV, Value *V) {
+ if (IV.isOverdefined())
+ return OverdefinedInstWorkList.push_back(V);
+ InstWorkList.push_back(V);
+ }
+
+  // markConstant - Mark a value as "constant". If the value is not already a
+  // constant, add it to the instruction work list so that the users of the
+  // instruction are updated later.
+  //
+ void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+ if (!IV.markConstant(C)) return;
+ DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+ void markConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ markConstant(ValueState[V], V, C);
+ }
+
+ void markForcedConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ LatticeVal &IV = ValueState[V];
+ IV.markForcedConstant(C);
+ DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+
+  // markOverdefined - Mark a value as "overdefined". If the value is not
+  // already overdefined, add it to the overdefined instruction work list so
+  // that the users of the instruction are updated later.
+ void markOverdefined(LatticeVal &IV, Value *V) {
+ if (!IV.markOverdefined()) return;
+
+ DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V))
+ dbgs() << "Function '" << F->getName() << "'\n";
+ else
+ dbgs() << *V << '\n');
+ // Only instructions go on the work list
+ pushToWorkList(IV, V);
+ }
+
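+  // mergeInValue - Merge MergeWithV into IV. A summary of the transitions
+  // implemented below (illustrative):
+  //   IV overdefined, or MergeWithV unknown            -> no change
+  //   MergeWithV overdefined                           -> IV becomes overdefined
+  //   IV unknown, MergeWithV constant C                -> IV becomes constant C
+  //   IV constant C1, MergeWithV constant C2, C1 != C2 -> IV becomes overdefined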
+ void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+ if (IV.isOverdefined() || MergeWithV.isUnknown())
+ return; // Noop.
+ if (MergeWithV.isOverdefined())
+ return markOverdefined(IV, V);
+ if (IV.isUnknown())
+ return markConstant(IV, V, MergeWithV.getConstant());
+ if (IV.getConstant() != MergeWithV.getConstant())
+ return markOverdefined(IV, V);
+ }
+
+ void mergeInValue(Value *V, LatticeVal MergeWithV) {
+ assert(!V->getType()->isStructTy() &&
+ "non-structs should use markConstant");
+ mergeInValue(ValueState[V], V, MergeWithV);
+ }
+
+
+ /// getValueState - Return the LatticeVal object that corresponds to the
+ /// value. This function handles the case when the value hasn't been seen yet
+ /// by properly seeding constants etc.
+ LatticeVal &getValueState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
+ ValueState.insert(std::make_pair(V, LatticeVal()));
+ LatticeVal &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ // Undef values remain unknown.
+ if (!isa<UndefValue>(V))
+ LV.markConstant(C); // Constants are constant
+ }
+
+    // All others are unknown by default.
+ return LV;
+ }
+
+ /// getStructValueState - Return the LatticeVal object that corresponds to the
+ /// value/field pair. This function handles the case when the value hasn't
+ /// been seen yet by properly seeding constants etc.
+ LatticeVal &getStructValueState(Value *V, unsigned i) {
+ assert(V->getType()->isStructTy() && "Should use getValueState");
+ assert(i < cast<StructType>(V->getType())->getNumElements() &&
+ "Invalid element #");
+
+ std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
+ bool> I = StructValueState.insert(
+ std::make_pair(std::make_pair(V, i), LatticeVal()));
+ LatticeVal &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ Constant *Elt = C->getAggregateElement(i);
+
+ if (!Elt)
+ LV.markOverdefined(); // Unknown sort of constant.
+ else if (isa<UndefValue>(Elt))
+ ; // Undef values remain unknown.
+ else
+ LV.markConstant(Elt); // Constants are constant.
+ }
+
+    // All others are unknown by default.
+ return LV;
+ }
+
+
+  /// markEdgeExecutable - Mark the edge from Source to Dest as executable. If
+  /// Dest was not already executable, mark it executable and add it to the BB
+  /// work list.
+ void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return; // This edge is already known to be executable!
+
+ if (!MarkBlockExecutable(Dest)) {
+ // If the destination is already executable, we just made an *edge*
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
+
+ PHINode *PN;
+ for (BasicBlock::iterator I = Dest->begin();
+ (PN = dyn_cast<PHINode>(I)); ++I)
+ visitPHINode(*PN);
+ }
+ }
+
+ // getFeasibleSuccessors - Return a vector of booleans to indicate which
+ // successors are reachable from a given terminator instruction.
+ //
+ void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ //
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+ // OperandChangedState - This method is invoked on all of the users of an
+ // instruction that was just changed state somehow. Based on this
+ // information, we need to update the specified user of this instruction.
+ //
+ void OperandChangedState(Instruction *I) {
+ if (BBExecutable.count(I->getParent())) // Inst is executable?
+ visit(*I);
+ }
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction. Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the value type of I to reflect these changes if appropriate.
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminatorInst(TerminatorInst &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+ void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+ markOverdefined(&CPI);
+ visitTerminatorInst(CPI);
+ }
+
+ // Instructions that cannot be folded away.
+ void visitStoreInst (StoreInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+ void visitCallInst (CallInst &I) {
+ visitCallSite(&I);
+ }
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallSite(&II);
+ visitTerminatorInst(II);
+ }
+ void visitCallSite (CallSite CS);
+ void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
+ void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+ void visitInstruction(Instruction &I) {
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
+ DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ markOverdefined(&I);
+ }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+ SmallVectorImpl<bool> &Succs) {
+ Succs.resize(TI.getNumSuccessors());
+ if (auto *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ return;
+ }
+
+ LatticeVal BCValue = getValueState(BI->getCondition());
+ ConstantInt *CI = BCValue.getConstantInt();
+ if (!CI) {
+ // Overdefined condition variables, and branches on unfoldable constant
+ // conditions, mean the branch could go either way.
+ if (!BCValue.isUnknown())
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ // Constant condition variables mean the branch can only go a single way.
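+    // (getSuccessor(0) is the 'true' destination, so a zero condition selects
+    // index 1 and a non-zero condition selects index 0.)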
+ Succs[CI->isZero()] = true;
+ return;
+ }
+
+  // Unwinding instructions' successors are always executable.
+ if (TI.isExceptional()) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (!SI->getNumCases()) {
+ Succs[0] = true;
+ return;
+ }
+ LatticeVal SCValue = getValueState(SI->getCondition());
+ ConstantInt *CI = SCValue.getConstantInt();
+
+ if (!CI) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!SCValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
+ return;
+ }
+
+  // If this is an indirect branch whose address is a blockaddress, we mark
+  // only that target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+    // have undefined behavior. It's OK to assume no successor is executable.
+ return;
+ }
+
+ DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+ assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+ // Make sure the source basic block is executable!!
+ if (!BBExecutable.count(From)) return false;
+
+ // Check to make sure this edge itself is actually feasible now.
+ TerminatorInst *TI = From->getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional())
+ return true;
+
+ LatticeVal BCValue = getValueState(BI->getCondition());
+
+ // Overdefined condition variables mean the branch could go either way,
+ // undef conditions mean that neither edge is feasible yet.
+ ConstantInt *CI = BCValue.getConstantInt();
+ if (!CI)
+ return !BCValue.isUnknown();
+
+ // Constant condition variables mean the branch can only go a single way.
+ return BI->getSuccessor(CI->isZero()) == To;
+ }
+
+  // Unwinding instructions' successors are always executable.
+ if (TI->isExceptional())
+ return true;
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (SI->getNumCases() < 1)
+ return true;
+
+ LatticeVal SCValue = getValueState(SI->getCondition());
+ ConstantInt *CI = SCValue.getConstantInt();
+
+ if (!CI)
+ return !SCValue.isUnknown();
+
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
+ }
+
+  // If this is an indirect branch on a blockaddress, the edge is feasible only
+  // if it leads to that block.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
+
+ DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+// visit Implementations - Something changed in this instruction: either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// performs the following actions:
+//
+// 1. If a phi node merges in two conflicting constants from different
+//    branches, or merges in an overdefined value, then the PHI node becomes
+//    overdefined.
+// 2. If a phi node merges in only constants and they all agree on a value, the
+//    PHI node becomes a constant equal to that value.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
+//
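+// For example (illustrative): given "%p = phi i32 [ 4, %a ], [ 4, %b ], [ %x, %c ]",
+// %p is marked constant 4 while the edge from %c is infeasible or %x is still
+// unknown, and it falls to overdefined once %x resolves to a different constant
+// or becomes overdefined.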
+void SCCPSolver::visitPHINode(PHINode &PN) {
+ // If this PN returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (PN.getType()->isStructTy())
+ return markOverdefined(&PN);
+
+ if (getValueState(&PN).isOverdefined())
+ return; // Quick exit
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64)
+ return markOverdefined(&PN);
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. If they are all
+ // constant, and they agree with each other, the PHI becomes the identical
+ // constant. If they are constant and don't agree, the PHI is overdefined.
+ // If there are no executable operands, the PHI remains unknown.
+ //
+ Constant *OperandVal = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LatticeVal IV = getValueState(PN.getIncomingValue(i));
+ if (IV.isUnknown()) continue; // Doesn't influence PHI node.
+
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+
+ if (IV.isOverdefined()) // PHI node becomes overdefined!
+ return markOverdefined(&PN);
+
+ if (!OperandVal) { // Grab the first value.
+ OperandVal = IV.getConstant();
+ continue;
+ }
+
+ // There is already a reachable operand. If we conflict with it,
+ // then the PHI node becomes overdefined. If we agree with it, we
+ // can continue on.
+
+ // Check to see if there are two different constants merging, if so, the PHI
+ // node is overdefined.
+ if (IV.getConstant() != OperandVal)
+ return markOverdefined(&PN);
+ }
+
+ // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other (and OperandVal is the constant) or
+ // OperandVal is null because there are no defined incoming arguments. If
+ // this is the case, the PHI remains unknown.
+ //
+ if (OperandVal)
+ markConstant(&PN, OperandVal); // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+ if (I.getNumOperands() == 0) return; // ret void
+
+ Function *F = I.getParent()->getParent();
+ Value *ResultOp = I.getOperand(0);
+
+ // If we are tracking the return value of this function, merge it in.
+ if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI =
+ TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end()) {
+ mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+ return;
+ }
+ }
+
+ // Handle functions that return multiple values.
+ if (!TrackedMultipleRetVals.empty()) {
+ if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (MRVFunctionsTracked.count(F))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+ getStructValueState(ResultOp, i));
+
+ }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable.
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+ LatticeVal OpSt = getValueState(I.getOperand(0));
+ if (OpSt.isOverdefined()) // Inherit overdefinedness of operand
+ markOverdefined(&I);
+ else if (OpSt.isConstant()) {
+ // Fold the constant as we build.
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpSt.getConstant(),
+ I.getType(), DL);
+ if (isa<UndefValue>(C))
+ return;
+ // Propagate constant value
+ markConstant(&I, C);
+ }
+}
+
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  // If this returns a struct, mark all elements overdefined; we don't track
+  // structs in structs.
+ if (EVI.getType()->isStructTy())
+ return markOverdefined(&EVI);
+
+ // If this is extracting from more than one level of struct, we don't know.
+ if (EVI.getNumIndices() != 1)
+ return markOverdefined(&EVI);
+
+ Value *AggVal = EVI.getAggregateOperand();
+ if (AggVal->getType()->isStructTy()) {
+ unsigned i = *EVI.idx_begin();
+ LatticeVal EltVal = getStructValueState(AggVal, i);
+ mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ } else {
+ // Otherwise, must be extracting from an array.
+ return markOverdefined(&EVI);
+ }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+ auto *STy = dyn_cast<StructType>(IVI.getType());
+ if (!STy)
+ return markOverdefined(&IVI);
+
+  // If this has more than one index, we can't handle it; drive all results to
+  // overdefined.
+ if (IVI.getNumIndices() != 1)
+ return markOverdefined(&IVI);
+
+ Value *Aggr = IVI.getAggregateOperand();
+ unsigned Idx = *IVI.idx_begin();
+
+ // Compute the result based on what we're inserting.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ // This passes through all values that aren't the inserted element.
+ if (i != Idx) {
+ LatticeVal EltVal = getStructValueState(Aggr, i);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+ continue;
+ }
+
+ Value *Val = IVI.getInsertedValueOperand();
+ if (Val->getType()->isStructTy())
+ // We don't track structs in structs.
+ markOverdefined(getStructValueState(&IVI, i), &IVI);
+ else {
+ LatticeVal InVal = getValueState(Val);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+ }
+ }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+ // If this select returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (I.getType()->isStructTy())
+ return markOverdefined(&I);
+
+ LatticeVal CondValue = getValueState(I.getCondition());
+ if (CondValue.isUnknown())
+ return;
+
+ if (ConstantInt *CondCB = CondValue.getConstantInt()) {
+ Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+ mergeInValue(&I, getValueState(OpVal));
+ return;
+ }
+
+ // Otherwise, the condition is overdefined or a constant we can't evaluate.
+ // See if we can produce something better than overdefined based on the T/F
+ // value.
+ LatticeVal TVal = getValueState(I.getTrueValue());
+ LatticeVal FVal = getValueState(I.getFalseValue());
+
+ // select ?, C, C -> C.
+ if (TVal.isConstant() && FVal.isConstant() &&
+ TVal.getConstant() == FVal.getConstant())
+ return markConstant(&I, FVal.getConstant());
+
+ if (TVal.isUnknown()) // select ?, undef, X -> X.
+ return mergeInValue(&I, FVal);
+ if (FVal.isUnknown()) // select ?, X, undef -> X.
+ return mergeInValue(&I, TVal);
+ markOverdefined(&I);
+}
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+ LatticeVal V1State = getValueState(I.getOperand(0));
+ LatticeVal V2State = getValueState(I.getOperand(1));
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (V1State.isConstant() && V2State.isConstant()) {
+ Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+ V2State.getConstant());
+ // X op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+ // If something is undef, wait for it to resolve.
+ if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ return;
+
+ // Otherwise, one of our operands is overdefined. Try to produce something
+ // better than overdefined with some tricks.
+ // If this is 0 / Y, it doesn't matter that the second operand is
+ // overdefined, and we can replace it with zero.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND/MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
+ if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
+ I.getOpcode() == Instruction::Or) {
+ LatticeVal *NonOverdefVal = nullptr;
+ if (!V1State.isOverdefined())
+ NonOverdefVal = &V1State;
+ else if (!V2State.isOverdefined())
+ NonOverdefVal = &V2State;
+
+ if (NonOverdefVal) {
+ if (NonOverdefVal->isUnknown())
+ return;
+
+ if (I.getOpcode() == Instruction::And ||
+ I.getOpcode() == Instruction::Mul) {
+ // X and 0 = 0
+ // X * 0 = 0
+ if (NonOverdefVal->getConstant()->isNullValue())
+ return markConstant(IV, &I, NonOverdefVal->getConstant());
+ } else {
+ // X or -1 = -1
+ if (ConstantInt *CI = NonOverdefVal->getConstantInt())
+ if (CI->isMinusOne())
+ return markConstant(IV, &I, NonOverdefVal->getConstant());
+ }
+ }
+ }
+
+
+ markOverdefined(&I);
+}
+
+// Handle CmpInst instructions (integer and floating-point compares).
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+ LatticeVal V1State = getValueState(I.getOperand(0));
+ LatticeVal V2State = getValueState(I.getOperand(1));
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (V1State.isConstant() && V2State.isConstant()) {
+ Constant *C = ConstantExpr::getCompare(
+ I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+  // If an operand is still unknown, wait for it to resolve.
+ if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ return;
+
+ markOverdefined(&I);
+}
+
+// Handle getelementptr instructions. If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+ if (ValueState[&I].isOverdefined()) return;
+
+ SmallVector<Constant*, 8> Operands;
+ Operands.reserve(I.getNumOperands());
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ LatticeVal State = getValueState(I.getOperand(i));
+ if (State.isUnknown())
+ return; // Operands are not resolved yet.
+
+ if (State.isOverdefined())
+ return markOverdefined(&I);
+
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ Constant *Ptr = Operands[0];
+ auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
+ Constant *C =
+ ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
+ if (isa<UndefValue>(C))
+ return;
+ markConstant(&I, C);
+}
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+ // If this store is of a struct, ignore it.
+ if (SI.getOperand(0)->getType()->isStructTy())
+ return;
+
+ if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+ return;
+
+ GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+ DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+ // Get the value we are storing into the global, then merge it.
+ mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
+ if (I->second.isOverdefined())
+ TrackedGlobals.erase(I); // No need to keep tracking this!
+}
+
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+ // If this load is of a struct, just mark the result overdefined.
+ if (I.getType()->isStructTy())
+ return markOverdefined(&I);
+
+ LatticeVal PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
+
+ LatticeVal &IV = ValueState[&I];
+ if (IV.isOverdefined()) return;
+
+ if (!PtrVal.isConstant() || I.isVolatile())
+ return markOverdefined(IV, &I);
+
+ Constant *Ptr = PtrVal.getConstant();
+
+ // load null is undefined.
+ if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
+ return;
+
+ // Transform load (constant global) into the value loaded.
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+ TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second);
+ return;
+ }
+ }
+ }
+
+ // Transform load from a constant into a constant if possible.
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(IV, &I, C);
+ }
+
+ // Otherwise we cannot say for certain what value this load will produce.
+ // Bail out.
+ markOverdefined(IV, &I);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ Instruction *I = CS.getInstruction();
+
+ // The common case is that we aren't tracking the callee, either because we
+ // are not doing interprocedural analysis or the callee is indirect, or is
+ // external. Handle these cases first.
+ if (!F || F->isDeclaration()) {
+CallOverdefined:
+ // Void return and not tracking callee, just bail.
+ if (I->getType()->isVoidTy()) return;
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
+ canConstantFoldCallTo(CS, F)) {
+
+ SmallVector<Constant*, 8> Operands;
+ for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+ AI != E; ++AI) {
+ LatticeVal State = getValueState(*AI);
+
+ if (State.isUnknown())
+ return; // Operands are not resolved yet.
+ if (State.isOverdefined())
+ return markOverdefined(I);
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+
+ if (getValueState(I).isOverdefined())
+ return;
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) {
+ // call -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(I, C);
+ }
+ }
+
+ // Otherwise, we don't know anything about this call, mark it overdefined.
+ return markOverdefined(I);
+ }
+
+ // If this is a local function that doesn't have its address taken, mark its
+  // entry block executable and merge the actual arguments of the call into
+ // the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ CallSite::arg_iterator CAI = CS.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
+ }
+ } else {
+ mergeInValue(&*AI, getValueState(*CAI));
+ }
+ }
+ }
+
+ // If this is a single/zero retval case, see if we're tracking the function.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (!MRVFunctionsTracked.count(F))
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If we are tracking this callee, propagate the result of the function
+ // into this call site.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(getStructValueState(I, i), I,
+ TrackedMultipleRetVals[std::make_pair(F, i)]);
+ } else {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+ if (TFRVI == TrackedRetVals.end())
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(I, TFRVI->second);
+ }
+}
+
+void SCCPSolver::Solve() {
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty() ||
+ !OverdefinedInstWorkList.empty()) {
+    // Process the overdefined instruction work list first, which drives other
+ // things to overdefined more quickly.
+ while (!OverdefinedInstWorkList.empty()) {
+ Value *I = OverdefinedInstWorkList.pop_back_val();
+
+ DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant, or to overdefined.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ // Process the instruction work list.
+ while (!InstWorkList.empty()) {
+ Value *I = InstWorkList.pop_back_val();
+
+ DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+ // "I" got into the work list because it made the transition from undef to
+ // constant.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ // Process the basic block work list.
+ while (!BBWorkList.empty()) {
+ BasicBlock *BB = BBWorkList.back();
+ BBWorkList.pop_back();
+
+ DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ visit(BB);
+ }
+ }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one of
+/// the edges out of its block as feasible, even though the condition does not
+/// otherwise say it is. This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible). This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined. For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
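+///
+/// For example (illustrative), given "br i1 undef, label %t, label %f", neither
+/// edge is considered feasible while solving; this method rewrites the
+/// condition to false and marks the edge to %f executable so solving can
+/// continue.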
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+ for (BasicBlock &BB : F) {
+ if (!BBExecutable.count(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ // Look for instructions which produce undef values.
+ if (I.getType()->isVoidTy()) continue;
+
+ if (auto *STy = dyn_cast<StructType>(I.getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (CallSite CS = CallSite(&I))
+ if (Function *F = CS.getCalledFunction())
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal &LV = getStructValueState(&I, i);
+ if (LV.isUnknown())
+ markOverdefined(LV, &I);
+ }
+ continue;
+ }
+
+ LatticeVal &LV = getValueState(&I);
+ if (!LV.isUnknown()) continue;
+
+ // extractvalue is safe; check here because the argument is a struct.
+ if (isa<ExtractValueInst>(I))
+ continue;
+
+ // Compute the operand LatticeVals, for convenience below.
+ // Anything taking a struct is conservatively assumed to require
+ // overdefined markings.
+ if (I.getOperand(0)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true;
+ }
+ LatticeVal Op0LV = getValueState(I.getOperand(0));
+ LatticeVal Op1LV;
+ if (I.getNumOperands() == 2) {
+ if (I.getOperand(1)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true;
+ }
+
+ Op1LV = getValueState(I.getOperand(1));
+ }
+      // If this is an instruction whose result is defined even if the input is
+ // not fully defined, propagate the information.
+ Type *ITy = I.getType();
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ break; // Any undef -> undef
+ case Instruction::FSub:
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ // Floating-point binary operation: be conservative.
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // undef -> 0; some outputs are impossible
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Mul:
+ case Instruction::And:
+ // Both operands undef -> undef
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ break;
+ // undef * X -> 0. X could be zero.
+ // undef & X -> 0. X could be zero.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::Or:
+ // Both operands undef -> undef
+ if (Op0LV.isUnknown() && Op1LV.isUnknown())
+ break;
+ // undef | X -> -1. X could be -1.
+ markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+ return true;
+
+ case Instruction::Xor:
+        // undef ^ undef -> 0; strictly speaking, this is not
+ // necessary, but we try to be nice to people who expect this
+ // behavior in simple cases
+ if (Op0LV.isUnknown() && Op1LV.isUnknown()) {
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ }
+ // undef ^ X -> undef
+ break;
+
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // X / undef -> undef. No change.
+ // X % undef -> undef. No change.
+ if (Op1LV.isUnknown()) break;
+
+ // X / 0 -> undef. No change.
+ // X % 0 -> undef. No change.
+ if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
+ break;
+
+ // undef / X -> 0. X could be maxint.
+ // undef % X -> 0. X could be 1.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::AShr:
+ // X >>a undef -> undef.
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
+
+ // undef >>a X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::LShr:
+ case Instruction::Shl:
+ // X << undef -> undef.
+ // X >> undef -> undef.
+ if (Op1LV.isUnknown()) break;
+
+ // Shifting by the bitwidth or more is undefined.
+ if (Op1LV.isConstant()) {
+ if (auto *ShiftAmt = Op1LV.getConstantInt())
+ if (ShiftAmt->getLimitedValue() >=
+ ShiftAmt->getType()->getScalarSizeInBits())
+ break;
+ }
+
+ // undef << X -> 0
+ // undef >> X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Select:
+ Op1LV = getValueState(I.getOperand(1));
+ // undef ? X : Y -> X or Y. There could be commonality between X/Y.
+ if (Op0LV.isUnknown()) {
+ if (!Op1LV.isConstant()) // Pick the constant one if there is any.
+ Op1LV = getValueState(I.getOperand(2));
+ } else if (Op1LV.isUnknown()) {
+ // c ? undef : undef -> undef. No change.
+ Op1LV = getValueState(I.getOperand(2));
+ if (Op1LV.isUnknown())
+ break;
+ // Otherwise, c ? undef : x -> x.
+ } else {
+ // Leave Op1LV as Operand(1)'s LatticeValue.
+ }
+
+ if (Op1LV.isConstant())
+ markForcedConstant(&I, Op1LV.getConstant());
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::Load:
+ // A load here means one of two things: a load of undef from a global,
+        // or a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ break;
+ case Instruction::ICmp:
+ // X == undef -> undef. Other comparisons get more complicated.
+ if (cast<ICmpInst>(&I)->isEquality())
+ break;
+ markOverdefined(&I);
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ // There are two reasons a call can have an undef result
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (Function *F = CallSite(&I).getCalledFunction())
+ if (TrackedRetVals.count(F))
+ break;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }
+ default:
+ // If we don't know what should happen here, conservatively mark it
+ // overdefined.
+ markOverdefined(&I);
+ return true;
+ }
+ }
+
+ // Check to see if we have a branch or switch on an undefined value. If so
+ // we force the branch to go one way or the other to make the successor
+ // values live. It doesn't really matter which way we force it.
+ TerminatorInst *TI = BB.getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // false.
+ if (isa<UndefValue>(BI->getCondition())) {
+ BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+ markEdgeExecutable(&BB, TI->getSuccessor(1));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to false.
+ markForcedConstant(BI->getCondition(),
+ ConstantInt::getFalse(TI->getContext()));
+ return true;
+ }
+
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+      // Indirect branch with no successors? It's OK to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to the first successor.
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually switch on undef, fix the undef to
+ // the first constant.
+ if (isa<UndefValue>(SI->getCondition())) {
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
+ return true;
+ }
+
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
+ Constant *Const = nullptr;
+ if (V->getType()->isStructTy()) {
+ std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
+ if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); }))
+ return false;
+ std::vector<Constant *> ConstVals;
+ auto *ST = dyn_cast<StructType>(V->getType());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ LatticeVal V = IVs[i];
+ ConstVals.push_back(V.isConstant()
+ ? V.getConstant()
+ : UndefValue::get(ST->getElementType(i)));
+ }
+ Const = ConstantStruct::get(ST, ConstVals);
+ } else {
+ LatticeVal IV = Solver.getLatticeValueFor(V);
+ if (IV.isOverdefined())
+ return false;
+ Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ }
+ assert(Const && "Constant is nullptr here!");
+ DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+
+ // Replaces all of the uses of a variable with uses of the constant.
+ V->replaceAllUsesWith(Const);
+ return true;
+}
+
+// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+static bool runSCCP(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ SCCPSolver Solver(DL, TLI);
+
+ // Mark the first block of the function as being executable.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Mark all arguments to the function as being overdefined.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+ DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // If we decided that there are basic blocks that are dead in this function,
+ // delete their contents now. Note that we cannot actually delete the blocks,
+ // as we cannot modify the CFG of the function.
+
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+
+ ++NumDeadBlocks;
+ NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);
+
+ MadeChanges = true;
+ continue;
+ }
+
+ // Iterate over all of the instructions in a function, replacing them with
+ // constants if we have found them to be of constant values.
+ //
+ for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
+ Instruction *Inst = &*BI++;
+ if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
+ continue;
+
+ if (tryToReplaceWithConstant(Solver, Inst)) {
+ if (isInstructionTriviallyDead(Inst))
+ Inst->eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++NumInstRemoved;
+ }
+ }
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+//===--------------------------------------------------------------------===//
+//
+/// SCCP Class - This class uses the SCCPSolver to implement a per-function
+/// Sparse Conditional Constant Propagator.
+///
+class SCCPLegacyPass : public FunctionPass {
+public:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ static char ID; // Pass identification, replacement for typeid
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
+ //
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runSCCP(F, DL, TLI);
+ }
+};
+} // end anonymous namespace
+
+char SCCPLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
+
+static bool AddressIsTaken(const GlobalValue *GV) {
+ // Delete any dead constantexpr klingons.
+ GV->removeDeadConstantUsers();
+
+ for (const Use &U : GV->uses()) {
+ const User *UR = U.getUser();
+ if (const auto *SI = dyn_cast<StoreInst>(UR)) {
+ if (SI->getOperand(0) == GV || SI->isVolatile())
+ return true; // Storing addr of GV.
+ } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
+ // Make sure we are calling the function, not passing the address.
+ ImmutableCallSite CS(cast<Instruction>(UR));
+ if (!CS.isCallee(&U))
+ return true;
+ } else if (const auto *LI = dyn_cast<LoadInst>(UR)) {
+ if (LI->isVolatile())
+ return true;
+ } else if (isa<BlockAddress>(UR)) {
+      // blockaddress doesn't take the address of the function; it takes the
+      // address of a label.
+ } else {
+ return true;
+ }
+ }
+ return false;
+}
+
+static void findReturnsToZap(Function &F,
+ SmallPtrSet<Function *, 32> &AddressTakenFunctions,
+ SmallVector<ReturnInst *, 8> &ReturnsToZap) {
+ // We can only do this if we know that nothing else can call the function.
+ if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F))
+ return;
+
+ for (BasicBlock &BB : F)
+ if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ ReturnsToZap.push_back(RI);
+}
+
+static bool runIPSCCP(Module &M, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ SCCPSolver Solver(DL, TLI);
+
+ // AddressTakenFunctions - This set keeps track of the address-taken functions
+ // that are in the input. As IPSCCP runs through and simplifies code,
+ // functions that were address taken can end up losing their
+  // address-taken-ness. Because of this, we record which functions were address
+  // taken during the first pass so we can use that information in the later
+  // simplification pass.
+ SmallPtrSet<Function*, 32> AddressTakenFunctions;
+
+ // Loop over all functions, marking arguments to those with their addresses
+ // taken or that are external as overdefined.
+ //
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ // If this is an exact definition of this function, then we can propagate
+ // information about its result into callsites of it.
+ // Don't touch naked functions. They may contain asm returning a
+ // value we don't see, so we may end up interprocedurally propagating
+ // the return value incorrectly.
+ if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked))
+ Solver.AddTrackedFunction(&F);
+
+ // If this function only has direct calls that we can see, we can track its
+ // arguments and return value aggressively, and can assume it is not called
+ // unless we see evidence to the contrary.
+ if (F.hasLocalLinkage()) {
+ if (F.hasAddressTaken()) {
+ AddressTakenFunctions.insert(&F);
+ }
+ else {
+ Solver.AddArgumentTrackedFunction(&F);
+ continue;
+ }
+ }
+
+ // Assume the function is called.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Assume nothing about the incoming arguments.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+ }
+
+  // Loop over global variables. We inform the solver about any internal global
+  // variables that do not have their addresses taken; for those, we can
+  // propagate constants through them.
+ for (GlobalVariable &G : M.globals())
+ if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
+ Solver.TrackValueOfGlobalVariable(&G);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+
+ DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ ResolvedUndefs = false;
+ for (Function &F : M)
+ ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // Iterate over all of the instructions in the module, replacing them with
+ // constants if we have found them to be of constant values.
+ //
+ SmallVector<BasicBlock*, 512> BlocksToErase;
+
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ if (Solver.isBlockExecutable(&F.front()))
+ for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
+ ++AI)
+ if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))
+ ++IPNumArgsElimed;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!Solver.isBlockExecutable(&*BB)) {
+ DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+
+ ++NumDeadBlocks;
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);
+
+ MadeChanges = true;
+
+ if (&*BB != &F.front())
+ BlocksToErase.push_back(&*BB);
+ continue;
+ }
+
+ for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+ Instruction *Inst = &*BI++;
+ if (Inst->getType()->isVoidTy())
+ continue;
+ if (tryToReplaceWithConstant(Solver, Inst)) {
+ if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+ Inst->eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++IPNumInstRemoved;
+ }
+ }
+ }
+
+ // Now that all instructions in the function are constant folded, erase dead
+ // blocks, because we can now use ConstantFoldTerminator to get rid of
+ // in-edges.
+ for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+ // If there are any PHI nodes in this successor, drop entries for BB now.
+ BasicBlock *DeadBB = BlocksToErase[i];
+ for (Value::user_iterator UI = DeadBB->user_begin(),
+ UE = DeadBB->user_end();
+ UI != UE;) {
+ // Grab the user and then increment the iterator early, as the user
+ // will be deleted. Step past all adjacent uses from the same user.
+ auto *I = dyn_cast<Instruction>(*UI);
+ do { ++UI; } while (UI != UE && *UI == I);
+
+ // Ignore blockaddress users; BasicBlock's dtor will handle them.
+ if (!I) continue;
+
+ bool Folded = ConstantFoldTerminator(I->getParent());
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void) Folded;
+ }
+
+ // Finally, delete the basic block.
+ F.getBasicBlockList().erase(DeadBB);
+ }
+ BlocksToErase.clear();
+ }
+
+ // If we inferred constant or undef return values for a function, we replaced
+ // all call uses with the inferred value. This means we don't need to bother
+ // actually returning anything from the function. Replace all return
+ // instructions with return undef.
+ //
+ // Do this in two stages: first identify the functions we should process, then
+ // actually zap their returns. This is important because we can only do this
+ // if the address of the function isn't taken. In cases where a return is the
+ // last use of a function, the order of processing functions would affect
+ // whether other functions are optimizable.
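+ //
+ // For example, if an internal function is known to always return the
+ // constant 42, every use of its call results has already been folded to 42,
+ // so its return instructions can simply produce undef instead.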
+ SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+ const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+ for (const auto &I : RV) {
+ Function *F = I.first;
+ if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
+ continue;
+ findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
+ }
+
+ for (const auto &F : Solver.getMRVFunctionsTracked()) {
+ assert(F->getReturnType()->isStructTy() &&
+ "The return type should be a struct");
+ StructType *STy = cast<StructType>(F->getReturnType());
+ if (Solver.isStructLatticeConstant(F, STy))
+ findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
+ }
+
+ // Zap all returns which we've identified as needing to be changed.
+ for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+ Function *F = ReturnsToZap[i]->getParent()->getParent();
+ ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+ }
+
+ // If we inferred constant or undef values for global variables, we can
+ // delete the global and any stores that remain to it.
+ const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+ for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+ E = TG.end(); I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ assert(!I->second.isOverdefined() &&
+ "Overdefined values should have been taken out of the map!");
+ DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+ while (!GV->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(GV->user_back());
+ SI->eraseFromParent();
+ }
+ M.getGlobalList().erase(GV);
+ ++IPNumGlobalConst;
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+ if (!runIPSCCP(M, DL, &TLI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return runIPSCCP(M, DL, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 000000000000..b9cee5b2ba95
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,4319 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Chrono.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#ifndef NDEBUG
+// We only use this for a debug check.
+#include <random>
+#endif
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+ cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
+
+namespace {
+/// \brief A custom IRBuilder inserter which prefixes all names, but only in
+/// Assert builds.
+class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
+ std::string Prefix;
+ const Twine getNameWithPrefix(const Twine &Name) const {
+ return Name.isTriviallyEmpty() ? Name : Prefix + Name;
+ }
+
+public:
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const {
+ IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
+ InsertPt);
+ }
+};
+
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
+} // end anonymous namespace
+
+namespace {
+/// \brief A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset;
+
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset;
+
+ /// \brief Storage for both the use of this slice and whether it can be
+ /// split.
+ PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+ Slice() : BeginOffset(), EndOffset() {}
+ Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset),
+ UseAndIsSplittable(U, IsSplittable) {}
+
+ uint64_t beginOffset() const { return BeginOffset; }
+ uint64_t endOffset() const { return EndOffset; }
+
+ bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+ void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+ Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+ bool isDead() const { return getUse() == nullptr; }
+ void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+ /// \brief Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
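+ ///
+ /// For example, the slices [0,16), [0,8) and [4,12) sort as [0,16), [0,8),
+ /// [4,12), so the spanning slice leads its cluster; with an equal begin
+ /// offset, an unsplittable slice orders before a splittable one.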
+ bool operator<(const Slice &RHS) const {
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
+ return false;
+ }
+
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+ uint64_t RHSOffset) {
+ return LHS.beginOffset() < RHSOffset;
+ }
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const Slice &RHS) {
+ return LHSOffset < RHS.beginOffset();
+ }
+
+ bool operator==(const Slice &RHS) const {
+ return isSplittable() == RHS.isSplittable() &&
+ beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+ }
+ bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
+} // end anonymous namespace
+
+namespace llvm {
+template <typename T> struct isPodLike;
+template <> struct isPodLike<Slice> { static const bool value = true; };
+} // end namespace llvm
+
+/// \brief Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+ /// \brief Construct the slices of a particular alloca.
+ AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+ /// \brief Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the slices are never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// \brief Support for iterating over the slices.
+ /// @{
+ typedef SmallVectorImpl<Slice>::iterator iterator;
+ typedef iterator_range<iterator> range;
+ iterator begin() { return Slices.begin(); }
+ iterator end() { return Slices.end(); }
+
+ typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+ typedef iterator_range<const_iterator> const_range;
+ const_iterator begin() const { return Slices.begin(); }
+ const_iterator end() const { return Slices.end(); }
+ /// @}
+
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
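+ ///
+ /// For example, appending a new slice [4,8) to existing slices [0,4) and
+ /// [8,12) leaves the collection sorted as [0,4), [4,8), [8,12): the newly
+ /// appended tail is sorted and then merged in place with the older slices.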
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare the iterator and range accessor for walking the
+ // partitions.
+ class partition_iterator;
+ iterator_range<partition_iterator> partitions();
+
+ /// \brief Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
+ /// \brief Access the dead operands referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void dump(const_iterator I) const;
+ void dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class SliceBuilder;
+ friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// \brief Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// \brief The instruction responsible for this alloca not having a known set
+ /// of slices.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to form slices of the
+ /// alloca. This will be null if the alloca slices are analyzed successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// \brief The slices of the alloca.
+ ///
+ /// We store a vector of the slices formed by uses of the alloca here. This
+ /// vector is sorted by increasing begin offset, and then the unsplittable
+ /// slices before the splittable ones. See the Slice inner class for more
+ /// details.
+ SmallVector<Slice, 8> Slices;
+
+ /// \brief Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by slice. This is because we expect an
+ /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+ /// all these instructions can simply be removed and replaced with undef as
+ /// they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
+ /// \brief Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+};
+
+/// \brief A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ typedef AllocaSlices::iterator iterator;
+
+ /// \brief The beginning and ending offsets of the alloca for this
+ /// partition.
+ uint64_t BeginOffset, EndOffset;
+
+ /// \brief The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// \brief An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is templated on the slice iterator type to use so that it can operate
+/// with either const or non-const slice iterators.
+class AllocaSlices::partition_iterator
+ : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+ Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset;
+
+ /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(
+ remove_if(P.SplitTails,
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ P.SplitTails.end());
+ assert(any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable, immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // slices.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions are marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+};
+
+/// \brief A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
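+///
+/// For example, given unsplittable slices [0,4) and [8,12) plus a splittable
+/// slice [0,16) spanning them, the partitions visited are [0,4), [4,8)
+/// (empty, covered only by the split tail of [0,16)), [8,12), and [12,16).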
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+}
+
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1 + CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
+
+ return nullptr;
+}
+
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// \brief Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+ friend class PtrUseVisitor<SliceBuilder>;
+ friend class InstVisitor<SliceBuilder>;
+ typedef PtrUseVisitor<SliceBuilder> Base;
+
+ const uint64_t AllocSize;
+ AllocaSlices &AS;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+ SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+ /// \brief Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+ : PtrUseVisitor<SliceBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.uge(AllocSize)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ return markAsDead(I);
+ }
+
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to remain within the " << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ }
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ return Base::visitBitCastInst(BC);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
+ APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the
+ // type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset += Index * APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()));
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
+ return Base::visitGetElementPtrInst(GEPI);
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ const DataLayout &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
+ << " which extends past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
+ return markAsDead(SI);
+ }
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && Offset.uge(AllocSize)))
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (Length && Length->getValue() == 0)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ // Because we can visit these intrinsics twice, also check to see if the
+ // first time marked this instruction as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ AS.Slices[MTPI->second].kill();
+ return markAsDead(II);
+ }
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+ // Check for the special case where the same exact value is used for both
+ // source and dest.
+ if (*U == II.getRawDest() && *U == II.getRawSource()) {
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return markAsDead(II);
+
+ return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+ }
+
+ // If we have seen both source and destination for a mem transfer, then
+ // they both point to the same alloca.
+ bool Inserted;
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+ std::tie(MTPI, Inserted) =
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
+ unsigned PrevIdx = MTPI->second;
+ if (!Inserted) {
+ Slice &PrevP = AS.Slices[PrevIdx];
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+ PrevP.kill();
+ return markAsDead(II);
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ PrevP.makeUnsplittable();
+ }
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+ // Check that we ended up with a valid index in the map.
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+ "Map index doesn't point back to a slice with this user.");
+ }
+
+ // Disable SROA for any intrinsics except for the lifetime intrinsics.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+ Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return;
+ }
+
+ Base::visitIntrinsicInst(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for slicing purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ std::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I)) {
+ return I;
+ }
+
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
+ } while (!Uses.empty());
+
+ return nullptr;
+ }
+
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
+
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+ // SelectInsts. However, doing so requires changing the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
+ else
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&I);
+
+ // See if we already have computed info on this node.
+ uint64_t &Size = PHIOrSelectSizes[&I];
+ if (!Size) {
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+ return PI.setAborted(UnsafeI);
+ }
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ // FIXME: This should instead be escaped in the event we're instrumenting
+ // for address sanitization.
+ if (Offset.uge(AllocSize)) {
+ AS.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
+ /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+ :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ AI(AI),
+#endif
+ PointerEscapingInstr(nullptr) {
+ SliceBuilder PB(DL, AI, *this);
+ SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaSlices.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
+ return;
+ }
+
+ Slices.erase(remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
+ Slices.end());
+
+#ifndef NDEBUG
+ if (SROARandomShuffleSlices) {
+ std::mt19937 MT(static_cast<unsigned>(
+ std::chrono::system_clock::now().time_since_epoch().count()));
+ std::shuffle(Slices.begin(), Slices.end(), MT);
+ }
+#endif
+
+ // Sort the uses. This arranges for the offsets to be in ascending order,
+ // and the sizes to be in descending order.
+ std::sort(Slices.begin(), Slices.end());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ printSlice(OS, I, Indent);
+ OS << "\n";
+ printUse(OS, I, Indent);
+}
+
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+ << " slice #" << (I - begin())
+ << (I->isSplittable() ? " (splittable)" : "");
+}
+
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
+}
+
+void AllocaSlices::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "Can't analyze slices for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Slices of alloca: " << AI << "\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ print(OS, I);
+}
+
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+ print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
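+///
+/// For example, if every slice spanning the partition is loaded or stored as
+/// 'float', the common type is 'float'; if the uses mix 'float' and 'i32',
+/// there is no common type and the widest integer type seen ('i32') is
+/// returned instead.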
+static Type *findCommonType(AllocaSlices::const_iterator B,
+ AllocaSlices::const_iterator E,
+ uint64_t EndOffset) {
+ Type *Ty = nullptr;
+ bool TyIsCommon = true;
+ IntegerType *ITy = nullptr;
+
+ // Note that we need to look at *every* alloca slice's Use to ensure we
+ // always get consistent results regardless of the order of slices.
+ for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+ Use *U = I->getUse();
+ if (isa<IntrinsicInst>(*U->getUser()))
+ continue;
+ if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+ continue;
+
+ Type *UserTy = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ }
+
+ if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (UserITy->getBitWidth() % 8 != 0 ||
+ UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ continue;
+
+ // Track the largest bitwidth integer type used in this way in case there
+ // is no common type.
+ if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+ ITy = UserITy;
+ }
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
+ }
+
+ return TyIsCommon ? Ty : ITy;
+}
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ unsigned MaxAlign = 0;
+ bool HaveLoad = false;
+ for (User *U : PN.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB)
+ return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ HaveLoad = true;
+ }
+
+ if (!HaveLoad)
+ return false;
+
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+
+ Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+ IRBuilderTy PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the AA tags and alignment to use from one of the loads. It doesn't
+ // matter which one we get or whether any of them differ.
+ LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ unsigned Align = SomeLoad->getAlignment();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN.user_back());
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ TerminatorInst *TI = Pred->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+ IRBuilderTy PredBuilder(TI);
+
+ LoadInst *Load = PredBuilder.CreateLoad(
+ InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+ ++NumLoadsSpeculated;
+ Load->setAlignment(Align);
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ NewPN->addIncoming(Load, Pred);
+ }
+
+ DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ PN.eraseFromParent();
+}
+
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+
+ for (User *U : SI.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to them.
+ if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
+ return false;
+ if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI))
+ return false;
+ }
+
+ return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+
+ IRBuilderTy IRB(&SI);
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!SI.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI.user_back());
+ assert(LI->isSimple() && "We only speculate simple loads");
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL =
+ IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL =
+ IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and AA info if present.
+ TL->setAlignment(LI->getAlignment());
+ FL->setAlignment(LI->getAlignment());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+ SI.eraseFromParent();
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+ SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+ NamePrefix + "sroa_idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
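+///
+/// For example, descending from a struct type { [2 x float] } toward a float
+/// TargetTy appends the zero indices (0, 0), stepping through the struct
+/// field and then the array element to reach the first float.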
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+ // Pointer size to use for the indices.
+ unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+
+ if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+ ElementTy = ArrayTy->getElementType();
+ Indices.push_back(IRB.getIntN(PtrSize, 0));
+ } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+ ElementTy = VectorTy->getElementType();
+ Indices.push_back(IRB.getInt32(0));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return nullptr;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+ if (ElementSizeInBits % 8 != 0) {
+ // GEPs over non-multiple of 8 size vector elements are invalid.
+ return nullptr;
+ }
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(VecTy->getNumElements()))
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, NamePrefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return nullptr;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return nullptr;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
+ return nullptr; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
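+///
+/// For illustration (hypothetical types): given a base pointer of type
+/// { i64, [4 x i32] }*, Offset = 8 and TargetTy = i32, the computed indices
+/// are (0, 1, 0), i.e. roughly
+///   getelementptr { i64, [4 x i32] }, { i64, [4 x i32] }* %base,
+///                 i64 0, i32 1, i64 0
+/// which lands exactly on the i32 at byte offset 8.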
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8.
+ if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+ return nullptr;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return nullptr; // We can't GEP through an unsized element.
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ if (ElementSize == 0)
+ return nullptr; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
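+///
+/// For illustration (hypothetical IR): asked to produce an i32* at offset 4
+/// from %a = alloca { i32, i32 }, this prefers the natural form
+///   getelementptr inbounds { i32, i32 }, { i32, i32 }* %a, i64 0, i32 1
+/// and only falls back to a raw "bitcast to i8*, GEP forward 4 bytes, bitcast
+/// to i32*" sequence when no such natural GEP can be found.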
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+ APInt Offset, Type *PointerTy, Twine NamePrefix) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // never are able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it and the base it was computed from around here.
+ Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = nullptr;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ Type *TargetTy = PointerTy->getPointerElementType();
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr).second)
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+ Indices, NamePrefix)) {
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses somehow!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
+ return P;
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->isInterposable())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr).second);
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(
+ Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+ NamePrefix + "sroa_raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+ IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (Ptr->getType() != PointerTy)
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
+
+ return Ptr;
+}
+
+/// \brief Compute the adjusted alignment for a load or store from an offset.
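+///
+/// For example (hypothetical numbers), a load of an 8-byte-aligned value that
+/// is rewritten to start 4 bytes further into memory can only be assumed to
+/// be 4-byte aligned: MinAlign(8, 4) == 4.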
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
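+///
+/// For example (hypothetical types, 64-bit DataLayout assumed): i32 <-> float
+/// is convertible (same size, both single-value types), i32 <-> i64 is not
+/// (different bit widths), and i64 <-> i8* is convertible unless the pointer
+/// type is in a non-integral address space.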
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+ if (OldTy == NewTy)
+ return true;
+
+ // For integer types, we can't handle any bit-width differences. This would
+ // break vector conversions that rely on extension and would introduce
+ // endianness issues when used in conjunction with loads and stores.
+ if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+ assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+ cast<IntegerType>(NewTy)->getBitWidth() &&
+ "We can't have the same bitwidth for different int types");
+ return false;
+ }
+
+ if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+ return false;
+ if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+ return false;
+
+ // We can convert pointers to integers and vice-versa. Same for vectors
+ // of pointers and integers.
+ OldTy = OldTy->getScalarType();
+ NewTy = NewTy->getScalarType();
+ if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
+ cast<PointerType>(OldTy)->getPointerAddressSpace();
+ }
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ Type *NewTy) {
+ Type *OldTy = V->getType();
+ assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+ if (OldTy == NewTy)
+ return V;
+
+ assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+ "Integer types must be the exact same to convert.");
+
+ // See if we need inttoptr for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ return IRB.CreateIntToPtr(V, NewTy);
+ }
+
+ // See if we need ptrtoint for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
+ // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ return IRB.CreatePtrToInt(V, NewTy);
+ }
+
+ return IRB.CreateBitCast(V, NewTy);
+}
+
+/// \brief Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice.
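+///
+/// For illustration (hypothetical numbers): with Ty = <4 x i32> and
+/// ElementSize = 4, a slice covering bytes [4, 12) of the partition maps to
+/// the element range [1, 3), so the slice would be rewritten as a <2 x i32>
+/// sub-vector access.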
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+ VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
+ // First validate the slice offsets.
+ uint64_t BeginOffset =
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= Ty->getNumElements())
+ return false;
+ uint64_t EndOffset =
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+ return false;
+
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements);
+
+ Type *SplitIntTy =
+ Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+ Use *U = S.getUse();
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ Type *LTy = LI->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(LTy->isIntegerTy());
+ LTy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, SliceTy, LTy))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ Type *STy = SI->getValueOperand()->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(STy->isIntegerTy());
+ STy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, STy, SliceTy))
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
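+///
+/// For illustration (hypothetical case): if a partition is covered exactly by
+/// a load of <4 x float> and a store of <4 x i32>, both types become
+/// candidates; because their element types differ, the non-integer candidate
+/// <4 x float> is dropped and <4 x i32> is the one tested for promotion.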
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+ // Collect the candidate types for vector-based promotion. Also track whether
+ // we have different element types.
+ SmallVector<VectorType *, 4> CandidateTys;
+ Type *CommonEltTy = nullptr;
+ bool HaveCommonEltTy = true;
+ auto CheckCandidateType = [&](Type *Ty) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ CandidateTys.push_back(VTy);
+ if (!CommonEltTy)
+ CommonEltTy = VTy->getElementType();
+ else if (CommonEltTy != VTy->getElementType())
+ HaveCommonEltTy = false;
+ }
+ };
+ // Consider any loads or stores that are the exact size of the slice.
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
+ if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+ CheckCandidateType(LI->getType());
+ else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+ CheckCandidateType(SI->getValueOperand()->getType());
+ }
+
+ // If we didn't find a vector type, nothing to do here.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Remove non-integer vector types if we had multiple common element types.
+ // FIXME: It'd be nice to replace them with integer vector types, but we can't
+ // do that until all the backends are known to produce good code for all
+ // integer vector types.
+ if (!HaveCommonEltTy) {
+ CandidateTys.erase(remove_if(CandidateTys,
+ [](VectorType *VTy) {
+ return !VTy->getElementType()->isIntegerTy();
+ }),
+ CandidateTys.end());
+
+ // If there were no integer vector types, give up.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Rank the remaining candidate vector types. This is easy because we know
+ // they're all integer vectors. We sort by ascending number of elements.
+ auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
+ assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+ "Cannot have vector types of different sizes!");
+ assert(RHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ assert(LHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ return RHSTy->getNumElements() < LHSTy->getNumElements();
+ };
+ std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+ CandidateTys.erase(
+ std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+ CandidateTys.end());
+ } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+ for (VectorType *VTy : CandidateTys) {
+ assert(VTy->getElementType() == CommonEltTy &&
+ "Unaccounted for element type!");
+ assert(VTy == CandidateTys[0] &&
+ "Different vector types with the same element type!");
+ }
+#endif
+ CandidateTys.resize(1);
+ }
+
+ // Try each vector type, and return the one which works.
+ auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+ uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+ "vector size not a multiple of element size?");
+ ElementSize /= 8;
+
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ return false;
+
+ return true;
+ };
+ for (VectorType *VTy : CandidateTys)
+ if (CheckVectorTypeForPromotion(VTy))
+ return VTy;
+
+ return nullptr;
+}
+
+/// \brief Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
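+///
+/// For illustration (hypothetical case): for an alloca of type i64, a plain
+/// i32 load at byte offset 4 stays within the 8 allocated bytes and is
+/// acceptable, while an i64 load of a slice beginning at offset 4 would
+/// extend past the end of the alloca, so widening is rejected.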
+static bool isIntegerWideningViableForSlice(const Slice &S,
+ uint64_t AllocBeginOffset,
+ Type *AllocaTy,
+ const DataLayout &DL,
+ bool &WholeAllocaOp) {
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
+ uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+ uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ Use *U = S.getUse();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ // We can't handle loads that extend past the allocated memory.
+ if (DL.getTypeStoreSize(LI->getType()) > Size)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, AllocaTy, LI->getType())) {
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ Type *ValueTy = SI->getValueOperand()->getType();
+ if (SI->isVolatile())
+ return false;
+ // We can't handle stores that extend past the allocated memory.
+ if (DL.getTypeStoreSize(ValueTy) > Size)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, ValueTy, AllocaTy)) {
+ // Non-integer stores need to be convertible to the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
+ uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
+ // Don't create integer types larger than the maximum bitwidth.
+ if (SizeInBits > IntegerType::MAX_INT_BITS)
+ return false;
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(DL, AllocaTy, IntTy) ||
+ !canConvertValue(DL, IntTy, AllocaTy))
+ return false;
+
+ // While examining uses, we ensure that the alloca has a covering load or
+ // store. We don't want to widen the integer operations only to fail to
+ // promote due to some other unsplittable entry (which we may make splittable
+ // later). However, if there are only splittable uses, go ahead and assume
+ // that we cover the alloca.
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
+ bool WholeAllocaOp =
+ P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
+
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ return WholeAllocaOp;
+}
+
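+/// \brief Extract the sub-integer of type \p Ty that lives \p Offset bytes
+/// into the wider integer \p V, honoring the target's endianness.
+///
+/// For illustration (hypothetical values, little-endian): extracting an i8 at
+/// Offset = 1 from an i32 holding 0xAABBCCDD shifts right by 8 and truncates,
+/// yielding 0xCC.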
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ IntegerType *Ty, uint64_t Offset,
+ const Twine &Name) {
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (Ty != IntTy) {
+ V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+ DEBUG(dbgs() << " trunced: " << *V << "\n");
+ }
+ return V;
+}
+
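+/// \brief Insert the (narrower) integer \p V into \p Old at byte \p Offset,
+/// masking out the bytes being replaced and OR-ing the new value in.
+///
+/// For illustration (hypothetical values, little-endian): inserting an i8
+/// 0x11 at Offset = 1 into an i32 holding 0xAABBCCDD zero-extends and shifts
+/// the i8 to 0x00001100, masks Old down to 0xAABB00DD, and produces
+/// 0xAABB11DD.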
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+ Value *V, uint64_t Offset, const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(Old->getType());
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ if (Ty != IntTy) {
+ V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+ DEBUG(dbgs() << " extended: " << *V << "\n");
+ }
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element store outside of alloca store");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+
+ if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+ APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+ Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+ DEBUG(dbgs() << " masked: " << *Old << "\n");
+ V = IRB.CreateOr(Old, V, Name + ".insert");
+ DEBUG(dbgs() << " inserted: " << *V << "\n");
+ }
+ return V;
+}
+
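+/// \brief Extract the contiguous element range [\p BeginIndex, \p EndIndex)
+/// from the vector value \p V.
+///
+/// A single element is pulled out with an extractelement; a multi-element
+/// range uses a shufflevector, e.g. (hypothetical) elements [1, 3) of a
+/// <4 x i32> become a <2 x i32> via the mask <i32 1, i32 2>.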
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(V->getType());
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<Constant *, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(IRB.getInt32(i));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask), Name + ".extract");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
+
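+/// \brief Insert the vector (or scalar) \p V into \p Old starting at element
+/// \p BeginIndex, leaving the remaining elements of \p Old untouched.
+///
+/// For illustration (hypothetical): inserting a <2 x i32> at BeginIndex = 1
+/// into a <4 x i32> first widens V with a shufflevector to
+/// <undef, v0, v1, undef> and then selects with the mask
+/// <false, true, true, false> so that only elements 1 and 2 come from V.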
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+ "Too many elements!");
+ if (Ty->getNumElements() == VecTy->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+ // When inserting a smaller vector into the larger one, we first use
+ // a shuffle vector to widen it with undef elements, and then a select
+ // with an i1 mask to choose between the incoming (widened) vector and
+ // the loaded vector for each element.
+ SmallVector<Constant *, 8> Mask;
+ Mask.reserve(VecTy->getNumElements());
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i - BeginIndex));
+ else
+ Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask), Name + ".expand");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+
+ Mask.clear();
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
+
+ V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+
+ DEBUG(dbgs() << " blend: " << *V << "\n");
+ return V;
+}
+
+/// \brief Visitor to rewrite instructions using a particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+ : public InstVisitor<AllocaSliceRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
+ typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
+
+ const DataLayout &DL;
+ AllocaSlices &AS;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // This is both a convenience and a flag variable: it will be null unless
+ // the new alloca's integer operations should be widened to an integer type,
+ // as determined by isIntegerWideningViable above. If it is non-null, it
+ // holds the desired integer type for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // The original offset of the slice currently being rewritten relative to
+ // the original alloca.
+ uint64_t BeginOffset, EndOffset;
+ // The new offsets of the slice currently being rewritten relative to the
+ // original alloca.
+ uint64_t NewBeginOffset, NewEndOffset;
+
+ uint64_t SliceSize;
+ bool IsSplittable;
+ bool IsSplit;
+ Use *OldUse;
+ Instruction *OldPtr;
+
+ // Track post-rewrite users which are PHI nodes and Selects.
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
+
+ // Utility IR builder whose name prefix is set up for each visited use and
+ // whose insertion point is set to the user.
+ IRBuilderTy IRB;
+
+public:
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewAllocaBeginOffset,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewAllocaBeginOffset),
+ NewAllocaEndOffset(NewAllocaEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ IntTy(IsIntegerPromotable
+ ? Type::getIntNTy(
+ NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ : nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+ BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
+ OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ IRB(NewAI.getContext(), ConstantFolder()) {
+ if (VecTy) {
+ assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ++NumVectorized;
+ }
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+ }
+
+ bool visit(AllocaSlices::const_iterator I) {
+ bool CanSROA = true;
+ BeginOffset = I->beginOffset();
+ EndOffset = I->endOffset();
+ IsSplittable = I->isSplittable();
+ IsSplit =
+ BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
+
+ // Compute the intersecting offset range.
+ assert(BeginOffset < NewAllocaEndOffset);
+ assert(EndOffset > NewAllocaBeginOffset);
+ NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+ NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+ SliceSize = NewEndOffset - NewBeginOffset;
+
+ OldUse = I->getUse();
+ OldPtr = cast<Instruction>(OldUse->get());
+
+ Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+ IRB.SetInsertPoint(OldUserI);
+ IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+ IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+ CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+ if (VecTy || IntTy)
+ assert(CanSROA);
+ return CanSROA;
+ }
+
+private:
+ // Make sure the other visit overloads are visible.
+ using Base::visit;
+
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+ // Note that the offset computation can use BeginOffset or NewBeginOffset
+ // interchangeably for unsplit slices.
+ assert(IsSplit || BeginOffset == NewBeginOffset);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+ StringRef OldName = OldPtr->getName();
+ // Skip through the last '.sroa.' component of the name.
+ size_t LastSROAPrefix = OldName.rfind(".sroa.");
+ if (LastSROAPrefix != StringRef::npos) {
+ OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+ // Look for an SROA slice index.
+ size_t IndexEnd = OldName.find_first_not_of("0123456789");
+ if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+ // Strip the index and look for the offset.
+ OldName = OldName.substr(IndexEnd + 1);
+ size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+ if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+ // Strip the offset.
+ OldName = OldName.substr(OffsetEnd + 1);
+ }
+ }
+ // Strip any SROA suffixes as well.
+ OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+ return getAdjustedPtr(IRB, DL, &NewAI,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
+#ifndef NDEBUG
+ Twine(OldName) + "."
+#else
+ Twine()
+#endif
+ );
+ }
+
+ /// \brief Compute suitable alignment to access this slice of the *new*
+ /// alloca.
+ ///
+ /// You can optionally pass a type to this routine and if that type's ABI
+ /// alignment is itself suitable, this will return zero.
+ unsigned getSliceAlign(Type *Ty = nullptr) {
+ unsigned NewAIAlign = NewAI.getAlignment();
+ if (!NewAIAlign)
+ NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
+ unsigned Align =
+ MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+ return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
+ }
+
+ unsigned getIndex(uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return Index;
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
+ Pass.DeadInsts.insert(I);
+ }
+
+ Value *rewriteVectorizedLoadInst() {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+ }
+
+ Value *rewriteIntegerLoad(LoadInst &LI) {
+ assert(IntTy && "We cannot insert an integer to the alloca");
+ assert(!LI.isVolatile());
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ V = convertValue(DL, IRB, V, IntTy);
+ assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
+ IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+ }
+ // It is possible that the extracted type is not the load type. This
+ // happens if there is a load past the end of the alloca, and as
+ // a consequence the slice is narrower but still a candidate for integer
+ // lowering. To handle this case, we just zero extend the extracted
+ // integer.
+ assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+ "Can only handle an extract for an overly wide load");
+ if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
+ V = IRB.CreateZExt(V, LI.getType());
+ return V;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+
+ unsigned AS = LI.getPointerAddressSpace();
+
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+ : LI.getType();
+ const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst();
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(LI);
+ } else if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+ TargetTy->isIntegerTy()))) {
+ LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+ // on the new load. This is true in some cases even when the loads have
+ // different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+ // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
+ // Try to preserve nonnull metadata
+ V = NewLI;
+
+ // If this is an integer load past the end of the slice (which means the
+ // bytes outside the slice are undef or this load is dead) just forcibly
+ // fix the integer size with correct handling of endianness.
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+ if (AITy->getBitWidth() < TITy->getBitWidth()) {
+ V = IRB.CreateZExt(V, TITy, "load.ext");
+ if (DL.isBigEndian())
+ V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ }
+ } else {
+ Type *LTy = TargetTy->getPointerTo(AS);
+ LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(TargetTy),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ V = NewLI;
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(DL, IRB, V, TargetTy);
+
+ if (IsSplit) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
+ "Split load isn't smaller than original load");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder =
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ Placeholder->deleteValue();
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
+
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+ if (V->getType() != VecTy) {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : VectorType::get(ElementTy, NumElements);
+ if (V->getType() != SliceTy)
+ V = convertValue(DL, IRB, V, SliceTy);
+
+ // Mix in the existing elements.
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ V = insertVector(IRB, Old, V, BeginIndex, "vec");
+ }
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+ }
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ Pass.DeadInsts.insert(&SI);
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ if (SliceSize < DL.getTypeStoreSize(V->getType())) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(V->getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(V->getType()) &&
+ "Non-byte-multiple bit width");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(V, SI, OldOp);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(V, SI);
+
+ const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
+ StoreInst *NewSI;
+ if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+ V->getType()->isIntegerTy()))) {
+ // If this is an integer store past the end of slice (and thus the bytes
+ // past that point are irrelevant or this is unreachable), truncate the
+ // value prior to storing.
+ if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (VITy->getBitWidth() > AITy->getBitWidth()) {
+ if (DL.isBigEndian())
+ V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ V = IRB.CreateTrunc(V, AITy, "load.trunc");
+ }
+
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ SI.isVolatile());
+ } else {
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
+ NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+ SI.isVolatile());
+ }
+ NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ Pass.DeadInsts.insert(&SI);
+ deleteIfTriviallyDead(OldOp);
+
+ DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ /// \brief Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte)
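+ ///
+ /// For example (hypothetical values): splatting 0xAB across Size = 4 zero
+ /// extends it to i32 and multiplies by 0x01010101 (the all-ones i32 udiv
+ /// the zero-extended all-ones i8), producing 0xABABABAB.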
+ Value *getIntegerSplat(Value *V, unsigned Size) {
+ assert(Size > 0 && "Expected a positive number of bytes.");
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+ if (Size == 1)
+ return V;
+
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
+ return V;
+ }
+
+ /// \brief Compute a vector splat for a given element value.
+ Value *getVectorSplat(Value *V, unsigned NumElements) {
+ V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+ DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getRawDest() == OldPtr);
+
+ // If the memset has a variable size, it cannot be split, just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ assert(!IsSplit);
+ assert(NewBeginOffset == BeginOffset);
+ II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(AllocaTy) ||
+ !AllocaTy->isSingleValueType() ||
+ !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
+ DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+ CallInst *New = IRB.CreateMemSet(
+ getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+ getSliceAlign(), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ Value *Splat =
+ getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+ Splat = convertValue(DL, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
+
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset)) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(DL, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(NewBeginOffset == NewAllocaBeginOffset);
+ assert(NewEndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(V, AllocaVecTy->getNumElements());
+
+ V = convertValue(DL, IRB, V, AllocaTy);
+ }
+
+ Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+
+ bool IsDest = &II.getRawDestUse() == OldUse;
+ assert((IsDest && II.getRawDest() == OldPtr) ||
+ (!IsDest && II.getRawSource() == OldPtr));
+
+ unsigned SliceAlign = getSliceAlign();
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+ // memcpy, and so simply updating the pointers is necessary for us to
+ // update both the source and the dest of a single call.
+ if (!IsSplittable) {
+ Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ if (IsDest)
+ II.setDest(AdjustedPtr);
+ else
+ II.setSource(AdjustedPtr);
+
+ if (II.getAlignment() > SliceAlign) {
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(
+ ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ }
+
+ DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ // Ensure the start lines up.
+ assert(NewBeginOffset == BeginOffset);
+
+ // Rewrite the size as needed.
+ if (NewEndOffset != EndOffset)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ NewEndOffset - NewBeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ assert(AI != &OldAI && AI != &NewAI &&
+ "Splittable transfers cannot reach the same alloca on both ends.");
+ Pass.Worklist.insert(AI);
+ }
+
+ Type *OtherPtrTy = OtherPtr->getType();
+ unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+ // Compute the relative offset for the other pointer within the transfer.
+ unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
+ APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+ unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
+
+ if (EmitMemCpy) {
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+
+ Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+ CallInst *New = IRB.CreateMemCpy(
+ IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+ MinAlign(SliceAlign, OtherAlign), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset;
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+ // Reset the other pointer type to match the register type we're going to
+ // use, but using the address space of the original other pointer.
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherPtrTy = VecTy->getElementType();
+ else
+ OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+ OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
+ } else {
+ OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
+ }
+
+ Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+ unsigned SrcAlign = OtherAlign;
+ Value *DstPtr = &NewAI;
+ unsigned DstAlign = SliceAlign;
+ if (!IsDest) {
+ std::swap(SrcPtr, DstPtr);
+ std::swap(SrcAlign, DstAlign);
+ }
+
+ Value *Src;
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = convertValue(DL, IRB, Src, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+ } else {
+ Src =
+ IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ }
+
+ if (VecTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+ Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+ DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getArgOperand(1) == OldPtr);
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ NewEndOffset - NewBeginOffset);
+ Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+
+ return true;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+ assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilderTy PtrBuilder(IRB);
+ if (isa<PHINode>(OldPtr))
+ PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+ else
+ PtrBuilder.SetInsertPoint(OldPtr);
+ PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+ Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+ DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // PHIs can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ PHIUsers.insert(&PN);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+ "Pointer isn't an operand!");
+ assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ if (SI.getOperand(1) == OldPtr)
+ SI.setOperand(1, NewPtr);
+ if (SI.getOperand(2) == OldPtr)
+ SI.setOperand(2, NewPtr);
+
+ DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Selects can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ SelectUsers.insert(&SI);
+ return true;
+ }
+};
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
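+///
+/// For example (illustrative), a first-class aggregate load such as
+///   %v = load { i32, float }, { i32, float }* %p
+/// is rewritten into scalar loads of each element's GEP, recombined with
+/// insertvalue instructions into an aggregate SSA value.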
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U;
+
+public:
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Use &U : I.uses())
+ if (Visited.insert(U.getUser()).second)
+ Queue.push_back(&U);
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// \brief Generic recursive split emission class.
+ template <typename Derived> class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilderTy IRB;
+    /// The indices to be used with insertvalue or extractvalue to select the
+    /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+ /// Initialize the splitter with an insertion point, Ptr and start with a
+ /// single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+ public:
+ /// \brief Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
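+    ///
+    /// For example (illustrative), when splitting a { i32, [2 x float] }, the
+    /// leaf at struct element 1, array index 0 is visited with Indices ==
+    /// {1, 0} and GEPIndices == {i32 0, i32 1, i32 0}; the extra leading zero
+    /// steps through the pointer operand.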
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType())
+ return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *GEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ LoadOpSplitter Splitter(&LI, *U);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+ Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ StoreOpSplitter Splitter(&SI, *U);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ SI.eraseFromParent();
+ return true;
+ }
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+}
+
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
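+///
+/// For example, [1 x { float }] is stripped all the way down to float, while
+/// { float, i32 } is returned unchanged because peeling off the outer struct
+/// would shrink the allocated size.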
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return Ty;
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+ Type *InnerTy;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ InnerTy = ArrTy->getElementType();
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Index = SL->getElementContainingOffset(0);
+ InnerTy = STy->getElementType(Index);
+ } else {
+ return Ty;
+ }
+
+ if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy))
+ return Ty;
+
+ return stripAggregateTypeWrapping(DL, InnerTy);
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset causes either end of a type-based partition to be
+/// off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
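+///
+/// For example, with Ty = [4 x i32], Offset = 4 and Size = 8 this returns the
+/// sub-array type [2 x i32]; with Size = 6 instead it returns null because no
+/// element boundary matches the end of the range.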
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
+ if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+ return stripAggregateTypeWrapping(DL, Ty);
+ if (Offset > DL.getTypeAllocSize(Ty) ||
+ (DL.getTypeAllocSize(Ty) - Offset) < Size)
+ return nullptr;
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+ Type *ElementTy = SeqTy->getElementType();
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (NumSkippedElements >= SeqTy->getNumElements())
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return nullptr;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ if (Offset >= ElementSize)
+ return nullptr; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return nullptr; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return nullptr;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+ const StructLayout *SubSL = DL.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return nullptr; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
+
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
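+  // For example, a 12-byte load whose slice spans partitions [0,4), [4,8) and
+  // [8,12) ends up with Splits == {4, 8}: the offsets of the interior split
+  // points relative to the slice's begin offset. The final end point is
+  // implied by the slice size itself.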
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+  // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this is a load we have to track that it can't participate in any
+ // pre-splitting. If this is a store of a load we have to track that
+ // that load also can't participate in any pre-splitting.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ Stores.erase(
+ remove_if(Stores,
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs() << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ // Now we have to go *back* through all the stores, because a later store may
+  // have caused an earlier store's load to become unsplittable, and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
+ Stores.erase(remove_if(Stores,
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
+ Stores.end());
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
+ Loads.erase(remove_if(Loads,
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
+ Loads.end());
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+  // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order to handle the case where one of the loads
+  // is the value operand of one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(LI);
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+ PLoad->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+ << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+  // below if the store is going to be split there anyway.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(SI);
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+ PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PStore
+ << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+  // Remove the killed slices that have been pre-split.
+ AS.erase(remove_if(AS, [](const Slice &S) { return S.isDead(); }), AS.end());
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+  // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ remove_if(
+ PromotableAllocas,
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
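+///
+/// For example, a 4-byte partition of a [12 x i8] alloca whose slices all load
+/// and store float would typically be rewritten onto a new "alloca float"
+/// (illustrative; the exact type depends on the common-use, type-partition and
+/// integer-widening heuristics below).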
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+ if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
+ SliceTy = CommonUseTy;
+ if (!SliceTy)
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size()))
+ SliceTy = TypePartitionTy;
+ if ((!SliceTy || (SliceTy->isArrayTy() &&
+ SliceTy->getArrayElementType()->isIntegerTy())) &&
+ DL.isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
+ if (!SliceTy)
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL.getTypeAllocSize(SliceTy) >= P.size());
+
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ if (VecTy)
+ SliceTy = VecTy;
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ AllocaInst *NewAI;
+ if (SliceTy == AI.getAllocatedType()) {
+ assert(P.beginOffset() == 0 &&
+ "Non-zero begin offset but same alloca type");
+ NewAI = &AI;
+ // FIXME: We should be able to bail at this point with "nothing changed".
+ // FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
+ } else {
+ unsigned Alignment = AI.getAlignment();
+ if (!Alignment) {
+ // The minimum alignment which users can rely on when the explicit
+ // alignment is omitted or zero is that required by the ABI for this
+ // type.
+ Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
+ }
+ Alignment = MinAlign(Alignment, P.beginOffset());
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ if (Alignment <= DL.getABITypeAlignment(SliceTy))
+ Alignment = 0;
+ NewAI = new AllocaInst(
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ ++NumNewAllocas;
+ }
+
+ DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
+
+ // Track the high watermark on the worklist as it is only relevant for
+ // promoted allocas. We will reset it to this point if the alloca is not in
+ // fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+ unsigned NumUses = 0;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
+
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
+ bool Promotable = true;
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
+
+ NumAllocaPartitionUses += NumUses;
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
+
+ // Now that we've processed all the slices in the new partition, check if any
+ // PHIs or Selects would block promotion.
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ if (Promotable) {
+ if (PHIUsers.empty() && SelectUsers.empty()) {
+ // Promote the alloca.
+ PromotableAllocas.push_back(NewAI);
+ } else {
+ // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote it on the
+ // next iteration.
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
+ Worklist.insert(NewAI);
+ }
+ } else {
+ // Drop any post-promotion work items if promotion didn't happen.
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+
+    // We couldn't promote and we didn't create a new partition, so nothing
+    // happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+
+ return NewAI;
+}
+
+/// \brief Walks the slices of an alloca and forms partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
+ return false;
+
+ unsigned NumPartitions = 0;
+ bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Now that we have identified any pre-splitting opportunities, mark any
+ // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+ // to split these during pre-splitting, we want to force them to be
+ // rewritten into a partition.
+ bool IsSorted = true;
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+ // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // These used to be the only splittable loads and stores, and we need to
+    // be confident that the above handling of splittable loads and stores is
+ // completely sufficient before we forcibly disable the remaining handling.
+ if (S.beginOffset() == 0 &&
+ S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
+ continue;
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
+
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Fragment, 4> Fragments;
+
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
+ }
+ ++NumPartitions;
+ }
+
+ NumAllocaPartitions += NumPartitions;
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
+
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individual partitions.
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+ auto *Var = DbgDecl->getVariable();
+ auto *Expr = DbgDecl->getExpression();
+ DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+ uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ FragmentExpr = DIB.createFragmentExpression(Start, Size);
+ }
+
+ // Remove any existing dbg.declare intrinsic describing the same alloca.
+ if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca))
+ OldDDI->eraseFromParent();
+
+ DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
+ DbgDecl->getDebugLoc(), &AI);
+ }
+ }
+ return Changed;
+}
+
+/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+ Value *OldV = U;
+ // Replace the use with an undef value.
+ U = UndefValue::get(OldV->getType());
+
+ // Check for this making an instruction dead. We have to garbage collect
+ // all the dead instructions to ensure the uses of any alloca end up being
+ // minimal.
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
+ DeadInsts.insert(OldI);
+ }
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // Skip alloca forms that this analysis can't handle.
+ if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter;
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the slices using a recursive instruction-visiting builder.
+ AllocaSlices AS(DL, AI);
+ DEBUG(AS.print(dbgs()));
+ if (AS.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
+ // Free up everything used by this instruction.
+ for (Use &DeadOp : DeadUser->operands())
+ clobberUse(DeadOp);
+
+ // Now replace the uses of this instruction.
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+ // And mark it for deletion.
+ DeadInsts.insert(DeadUser);
+ Changed = true;
+ }
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
+ Changed = true;
+ }
+
+ // No slices to split. Leave the dead alloca for a later pass to clean up.
+ if (AS.begin() == AS.end())
+ return Changed;
+
+ Changed |= splitAlloca(AI, AS);
+
+ DEBUG(dbgs() << " Speculating PHIs\n");
+ while (!SpeculatablePHIs.empty())
+ speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+ DEBUG(dbgs() << " Speculating Selects\n");
+ while (!SpeculatableSelects.empty())
+ speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+ return Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+void SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ while (!DeadInsts.empty()) {
+ Instruction *I = DeadInsts.pop_back_val();
+ DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+ for (Use &Operand : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(Operand)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ Operand = nullptr;
+ if (isInstructionTriviallyDead(U))
+ DeadInsts.insert(U);
+ }
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
+ DbgDecl->eraseFromParent();
+ }
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// This function returns whether any promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
+ PromotableAllocas.clear();
+ return true;
+}
+
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+ AssumptionCache &RunAC) {
+ DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ DT = &RunDT;
+ AC = &RunAC;
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+ I != E; ++I) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ Worklist.insert(AI);
+ }
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ do {
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ deleteDeadInstructions(DeletedAllocas);
+
+ // Remove the deleted allocas from various lists so that we don't try to
+ // continue processing them.
+ if (!DeletedAllocas.empty()) {
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
+ Worklist.remove_if(IsInSet);
+ PostPromotionWorklist.remove_if(IsInSet);
+ PromotableAllocas.erase(remove_if(PromotableAllocas, IsInSet),
+ PromotableAllocas.end());
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ Worklist = PostPromotionWorklist;
+ PostPromotionWorklist.clear();
+ } while (!Worklist.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
+ return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F));
+}
+
+/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
+///
+/// This is in the llvm namespace purely to allow it to be a friend of the \c
+/// SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+ /// The SROA implementation.
+ SROA Impl;
+
+public:
+ SROALegacyPass() : FunctionPass(ID) {
+ initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto PA = Impl.runImpl(
+ F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+ return !PA.areAllPreserved();
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "SROA"; }
+ static char ID;
+};
+
+char SROALegacyPass::ID = 0;
+
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+ "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 000000000000..ce6f93eb0c15
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,282 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+
+using namespace llvm;
+
+/// initializeScalarOpts - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+ initializeADCELegacyPassPass(Registry);
+ initializeBDCELegacyPassPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
+ initializeConstantHoistingLegacyPassPass(Registry);
+ initializeConstantPropagationPass(Registry);
+ initializeCorrelatedValuePropagationPass(Registry);
+ initializeDCELegacyPassPass(Registry);
+ initializeDeadInstEliminationPass(Registry);
+ initializeScalarizerPass(Registry);
+ initializeDSELegacyPassPass(Registry);
+ initializeGuardWideningLegacyPassPass(Registry);
+ initializeGVNLegacyPassPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
+ initializeEarlyCSEMemSSALegacyPassPass(Registry);
+ initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
+ initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
+ initializeJumpThreadingPass(Registry);
+ initializeLegacyLICMPassPass(Registry);
+ initializeLegacyLoopSinkPassPass(Registry);
+ initializeLoopDataPrefetchLegacyPassPass(Registry);
+ initializeLoopDeletionLegacyPassPass(Registry);
+ initializeLoopAccessLegacyAnalysisPass(Registry);
+ initializeLoopInstSimplifyLegacyPassPass(Registry);
+ initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
+ initializeLoopRotateLegacyPassPass(Registry);
+ initializeLoopStrengthReducePass(Registry);
+ initializeLoopRerollPass(Registry);
+ initializeLoopUnrollPass(Registry);
+ initializeLoopUnswitchPass(Registry);
+ initializeLoopVersioningLICMPass(Registry);
+ initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+ initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerExpectIntrinsicPass(Registry);
+ initializeLowerGuardIntrinsicLegacyPassPass(Registry);
+ initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+ initializeNaryReassociateLegacyPassPass(Registry);
+ initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeReassociateLegacyPassPass(Registry);
+ initializeRegToMemPass(Registry);
+ initializeRewriteStatepointsForGCPass(Registry);
+ initializeSCCPLegacyPassPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
+ initializeSROALegacyPassPass(Registry);
+ initializeCFGSimplifyPassPass(Registry);
+ initializeLateCFGSimplifyPassPass(Registry);
+ initializeStructurizeCFGPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
+ initializeSinkingLegacyPassPass(Registry);
+ initializeTailCallElimPass(Registry);
+ initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeSpeculativeExecutionLegacyPassPass(Registry);
+ initializeStraightLineStrengthReducePass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
+ initializeFloat2IntLegacyPassPass(Registry);
+ initializeLoopDistributeLegacyPass(Registry);
+ initializeLoopLoadEliminationPass(Registry);
+ initializeLoopSimplifyCFGLegacyPassPass(Registry);
+ initializeLoopVersioningPassPass(Registry);
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+ initializeScalarOpts(*unwrap(R));
+}
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCFGSimplificationPass());
+}
+
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScalarizerPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createNewGVNPass());
+}
+
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSinkPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRerollPass());
+}
+
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+ int Threshold) {
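+  // Note (assumption, inferred from the body below): the Threshold argument is
+  // intentionally ignored, since the SROA pass that replaced the old
+  // scalar-replacement-of-aggregates implementation does not take a threshold;
+  // the parameter is presumably kept only for C API compatibility.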
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
+}
+
+void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
+}
+
+void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNHoistPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTypeBasedAAWrapperPass());
+}
+
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBasicAAWrapperPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
new file mode 100644
index 000000000000..d11855f2f3a9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -0,0 +1,772 @@
+//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
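+// As an illustrative sketch (the value names are made up rather than taken
+// from a test case), a vector add such as
+//
+//   %sum = add <4 x i32> %a, %b
+//
+// becomes four scalar adds on the extracted elements; the full vector result
+// is rebuilt with insertelement instructions only if some remaining user
+// still needs it (see Scalarizer::finish).
+//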
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+namespace {
+// Used to store the scattered form of a vector.
+typedef SmallVector<Value *, 8> ValueVector;
+
+// Used to map a vector Value to its scattered form. We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+typedef std::map<Value *, ValueVector> ScatterMap;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+ Scatterer() {}
+
+ // Scatter V into Size components. If new instructions are needed,
+ // insert them before BBI in BB. If Cache is nonnull, use it to cache
+ // the results.
+ Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr = nullptr);
+
+ // Return component I, creating a new Value for it if necessary.
+ Value *operator[](unsigned I);
+
+ // Return the number of components.
+ unsigned size() const { return Size; }
+
+private:
+ BasicBlock *BB;
+ BasicBlock::iterator BBI;
+ Value *V;
+ ValueVector *CachePtr;
+ PointerType *PtrTy;
+ ValueVector Tmp;
+ unsigned Size;
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+ FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+ }
+ FCmpInst &FCI;
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+ ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+ }
+ ICmpInst &ICI;
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+ BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+ }
+ BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+ VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {}
+
+ // Return the alignment of element I.
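+  // Illustrative example (values assumed, not taken from a target): with
+  // VecAlign == 16 and ElemSize == 4, elements 0..3 get alignments 16, 4, 8
+  // and 4, i.e. each element keeps the largest power-of-two alignment
+  // compatible with its byte offset from the start of the vector.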
+ uint64_t getElemAlign(unsigned I) {
+ return MinAlign(VecAlign, I * ElemSize);
+ }
+
+ // The type of the vector.
+ VectorType *VecTy;
+
+ // The type of each element.
+ Type *ElemTy;
+
+ // The alignment of the vector.
+ uint64_t VecAlign;
+
+ // The size of each element.
+ uint64_t ElemSize;
+};
+
+class Scalarizer : public FunctionPass,
+ public InstVisitor<Scalarizer, bool> {
+public:
+ static char ID;
+
+ Scalarizer() :
+ FunctionPass(ID) {
+ initializeScalarizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ // InstVisitor methods. They return true if the instruction was scalarized,
+ // false if nothing changed.
+ bool visitInstruction(Instruction &) { return false; }
+ bool visitSelectInst(SelectInst &SI);
+ bool visitICmpInst(ICmpInst &);
+ bool visitFCmpInst(FCmpInst &);
+ bool visitBinaryOperator(BinaryOperator &);
+ bool visitGetElementPtrInst(GetElementPtrInst &);
+ bool visitCastInst(CastInst &);
+ bool visitBitCastInst(BitCastInst &);
+ bool visitShuffleVectorInst(ShuffleVectorInst &);
+ bool visitPHINode(PHINode &);
+ bool visitLoadInst(LoadInst &);
+ bool visitStoreInst(StoreInst &);
+ bool visitCallInst(CallInst &I);
+
+ static void registerOptions() {
+ // This is disabled by default because having separate loads and stores
+ // makes it more likely that the -combiner-alias-analysis limits will be
+ // reached.
+ OptionRegistry::registerOption<bool, Scalarizer,
+ &Scalarizer::ScalarizeLoadStore>(
+ "scalarize-load-store",
+ "Allow the scalarizer pass to scalarize loads and stores", false);
+ }
+
+private:
+ Scatterer scatter(Instruction *, Value *);
+ void gather(Instruction *, const ValueVector &);
+ bool canTransferMetadata(unsigned Kind);
+ void transferMetadata(Instruction *, const ValueVector &);
+ bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &);
+ bool finish();
+
+ template<typename T> bool splitBinary(Instruction &, const T &);
+
+ bool splitCall(CallInst &CI);
+
+ ScatterMap Scattered;
+ GatherList Gathered;
+ unsigned ParallelLoopAccessMDKind;
+ bool ScalarizeLoadStore;
+};
+
+char Scalarizer::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+ "Scalarize vector operations", false, false)
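+
+// To exercise this pass in isolation, an invocation along the lines of
+//   opt -scalarizer -scalarize-load-store -S in.ll
+// can be used (illustrative command line; -scalarize-load-store is the option
+// registered in Scalarizer::registerOptions above).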
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr)
+ : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+ Type *Ty = V->getType();
+ PtrTy = dyn_cast<PointerType>(Ty);
+ if (PtrTy)
+ Ty = PtrTy->getElementType();
+ Size = Ty->getVectorNumElements();
+ if (!CachePtr)
+ Tmp.resize(Size, nullptr);
+ else if (CachePtr->empty())
+ CachePtr->resize(Size, nullptr);
+ else
+ assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+ ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+ // Try to reuse a previous value.
+ if (CV[I])
+ return CV[I];
+ IRBuilder<> Builder(BB, BBI);
+ if (PtrTy) {
+ if (!CV[0]) {
+ Type *Ty =
+ PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+ PtrTy->getAddressSpace());
+ CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+ }
+ if (I != 0)
+ CV[I] = Builder.CreateConstGEP1_32(nullptr, CV[0], I,
+ V->getName() + ".i" + Twine(I));
+ } else {
+ // Search through a chain of InsertElementInsts looking for element I.
+ // Record other elements in the cache. The new V is still suitable
+ // for all uncached indices.
+ for (;;) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+ if (!Insert)
+ break;
+ ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+ if (!Idx)
+ break;
+ unsigned J = Idx->getZExtValue();
+ V = Insert->getOperand(0);
+ if (I == J) {
+ CV[J] = Insert->getOperand(1);
+ return CV[J];
+ } else if (!CV[J]) {
+ // Only cache the first entry we find for each index we're not actively
+ // searching for. This prevents us from going too far up the chain and
+ // caching incorrect entries.
+ CV[J] = Insert->getOperand(1);
+ }
+ }
+ CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+ V->getName() + ".i" + Twine(I));
+ }
+ return CV[I];
+}
+
+bool Scalarizer::doInitialization(Module &M) {
+ ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ ScalarizeLoadStore =
+ M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
+ return false;
+}
+
+bool Scalarizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ assert(Gathered.empty() && Scattered.empty());
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ Instruction *I = &*II;
+ bool Done = visit(I);
+ ++II;
+ if (Done && I->getType()->isVoidTy())
+ I->eraseFromParent();
+ }
+ }
+ return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+ if (Argument *VArg = dyn_cast<Argument>(V)) {
+ // Put the scattered form of arguments in the entry block,
+ // so that it can be used everywhere.
+ Function *F = VArg->getParent();
+ BasicBlock *BB = &F->getEntryBlock();
+ return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+ }
+ if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+ // Put the scattered form of an instruction directly after the
+ // instruction.
+ BasicBlock *BB = VOp->getParent();
+ return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+ V, &Scattered[V]);
+ }
+ // In the fallback case, just put the scattered form of the value before
+ // Point and keep the result local to Point.
+ return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+ // Since we're not deleting Op yet, stub out its operands, so that it
+ // doesn't make anything live unnecessarily.
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+ Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+ transferMetadata(Op, CV);
+
+ // If we already have a scattered form of Op (created from ExtractElements
+ // of Op itself), replace them with the new form.
+ ValueVector &SV = Scattered[Op];
+ if (!SV.empty()) {
+ for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+ Value *V = SV[I];
+ if (V == nullptr)
+ continue;
+
+ Instruction *Old = cast<Instruction>(V);
+ CV[I]->takeName(Old);
+ Old->replaceAllUsesWith(CV[I]);
+ Old->eraseFromParent();
+ }
+ }
+ SV = CV;
+ Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+ return (Tag == LLVMContext::MD_tbaa
+ || Tag == LLVMContext::MD_fpmath
+ || Tag == LLVMContext::MD_tbaa_struct
+ || Tag == LLVMContext::MD_invariant_load
+ || Tag == LLVMContext::MD_alias_scope
+ || Tag == LLVMContext::MD_noalias
+ || Tag == ParallelLoopAccessMDKind);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ Op->getAllMetadataOtherThanDebugLoc(MDs);
+ for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+ if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+ for (const auto &MD : MDs)
+ if (canTransferMetadata(MD.first))
+ New->setMetadata(MD.first, MD.second);
+ if (Op->getDebugLoc() && !New->getDebugLoc())
+ New->setDebugLoc(Op->getDebugLoc());
+ }
+ }
+}
+
+// Try to fill in Layout from Ty, returning true on success. Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+ VectorLayout &Layout, const DataLayout &DL) {
+ // Make sure we're dealing with a vector.
+ Layout.VecTy = dyn_cast<VectorType>(Ty);
+ if (!Layout.VecTy)
+ return false;
+
+ // Check that we're dealing with full-byte elements.
+ Layout.ElemTy = Layout.VecTy->getElementType();
+ if (DL.getTypeSizeInBits(Layout.ElemTy) !=
+ DL.getTypeStoreSizeInBits(Layout.ElemTy))
+ return false;
+
+ if (Alignment)
+ Layout.VecAlign = Alignment;
+ else
+ Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy);
+ Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
+ return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
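+// For example (illustrative IR, not from a test), scalarizing
+//   %r = mul <2 x i32> %x, %y
+// calls Split twice, yielding %r.i0 = mul i32 %x.i0, %y.i0 and
+// %r.i1 = mul i32 %x.i1, %y.i1, which are then handed to gather().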
+template<typename Splitter>
+bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer Op0 = scatter(&I, I.getOperand(0));
+ Scatterer Op1 = scatter(&I, I.getOperand(1));
+ assert(Op0.size() == NumElems && "Mismatched binary operation");
+ assert(Op1.size() == NumElems && "Mismatched binary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+ Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem],
+ I.getName() + ".i" + Twine(Elem));
+ gather(&I, Res);
+ return true;
+}
+
+static bool isTriviallyScalariable(Intrinsic::ID ID) {
+ return isTriviallyVectorizable(ID);
+}
+
+// All of the current scalarizable intrinsics only have one mangled type.
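+// For example (illustrative), ID == Intrinsic::sqrt with Ty == <4 x float>
+// yields the declaration of llvm.sqrt.f32.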
+static Function *getScalarIntrinsicDeclaration(Module *M,
+ Intrinsic::ID ID,
+ VectorType *Ty) {
+ return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
+}
+
+/// If CI is a call to a vector-typed intrinsic function, split it into a
+/// scalar call per element when the intrinsic allows it.
+bool Scalarizer::splitCall(CallInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getType());
+ if (!VT)
+ return false;
+
+ Function *F = CI.getCalledFunction();
+ if (!F)
+ return false;
+
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ unsigned NumArgs = CI.getNumArgOperands();
+
+ ValueVector ScalarOperands(NumArgs);
+ SmallVector<Scatterer, 8> Scattered(NumArgs);
+
+ // Assumes that any vector type has the same number of elements as the return
+ // vector type, which is true for all current intrinsics.
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ Value *OpI = CI.getOperand(I);
+ if (OpI->getType()->isVectorTy()) {
+ Scattered[I] = scatter(&CI, OpI);
+ assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ } else {
+ ScalarOperands[I] = OpI;
+ }
+ }
+
+ ValueVector Res(NumElems);
+ ValueVector ScalarCallOps(NumArgs);
+
+ Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
+ IRBuilder<> Builder(&CI);
+
+ // Perform actual scalarization, taking care to preserve any scalar operands.
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ ScalarCallOps.clear();
+
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (hasVectorInstrinsicScalarOpd(ID, J))
+ ScalarCallOps.push_back(ScalarOperands[J]);
+ else
+ ScalarCallOps.push_back(Scattered[J][Elem]);
+ }
+
+ Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
+ CI.getName() + ".i" + Twine(Elem));
+ }
+
+ gather(&CI, Res);
+ return true;
+}
+
+bool Scalarizer::visitSelectInst(SelectInst &SI) {
+ VectorType *VT = dyn_cast<VectorType>(SI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer Op1 = scatter(&SI, SI.getOperand(1));
+ Scatterer Op2 = scatter(&SI, SI.getOperand(2));
+ assert(Op1.size() == NumElems && "Mismatched select");
+ assert(Op2.size() == NumElems && "Mismatched select");
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (SI.getOperand(0)->getType()->isVectorTy()) {
+ Scatterer Op0 = scatter(&SI, SI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched select");
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I],
+ SI.getName() + ".i" + Twine(I));
+ } else {
+ Value *Op0 = SI.getOperand(0);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I],
+ SI.getName() + ".i" + Twine(I));
+ }
+ gather(&SI, Res);
+ return true;
+}
+
+bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+ return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+ return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+ return splitBinary(BO, BinarySplitter(BO));
+}
+
+bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+ if (!VT)
+ return false;
+
+ IRBuilder<> Builder(&GEPI);
+ unsigned NumElems = VT->getNumElements();
+ unsigned NumIndices = GEPI.getNumIndices();
+
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
+
+ SmallVector<Scatterer, 8> Ops;
+ Ops.resize(NumIndices);
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
+
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ SmallVector<Value *, 8> Indices;
+ Indices.resize(NumIndices);
+ for (unsigned J = 0; J < NumIndices; ++J)
+ Indices[J] = Ops[J][I];
+ Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+ GEPI.getName() + ".i" + Twine(I));
+ if (GEPI.isInBounds())
+ if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+ NewGEPI->setIsInBounds();
+ }
+ gather(&GEPI, Res);
+ return true;
+}
+
+bool Scalarizer::visitCastInst(CastInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&CI);
+ Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched cast");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+ CI.getName() + ".i" + Twine(I));
+ gather(&CI, Res);
+ return true;
+}
+
+bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+ VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+ VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+ if (!DstVT || !SrcVT)
+ return false;
+
+ unsigned DstNumElems = DstVT->getNumElements();
+ unsigned SrcNumElems = SrcVT->getNumElements();
+ IRBuilder<> Builder(&BCI);
+ Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+ ValueVector Res;
+ Res.resize(DstNumElems);
+
+ if (DstNumElems == SrcNumElems) {
+ for (unsigned I = 0; I < DstNumElems; ++I)
+ Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(I));
+ } else if (DstNumElems > SrcNumElems) {
+ // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
+ // individual elements to the destination.
+ unsigned FanOut = DstNumElems / SrcNumElems;
+ Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+ unsigned ResI = 0;
+ for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+ Value *V = Op0[Op0I];
+ Instruction *VI;
+ // Look through any existing bitcasts before converting to <N x t2>.
+ // In the best case, the resulting conversion might be a no-op.
+ while ((VI = dyn_cast<Instruction>(V)) &&
+ VI->getOpcode() == Instruction::BitCast)
+ V = VI->getOperand(0);
+ V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+ Scatterer Mid = scatter(&BCI, V);
+ for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+ Res[ResI++] = Mid[MidI];
+ }
+ } else {
+ // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
+ unsigned FanIn = SrcNumElems / DstNumElems;
+ Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+ unsigned Op0I = 0;
+ for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+ Value *V = UndefValue::get(MidTy);
+ for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+ V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+ BCI.getName() + ".i" + Twine(ResI)
+ + ".upto" + Twine(MidI));
+ Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(ResI));
+ }
+ }
+ gather(&BCI, Res);
+ return true;
+}
+
+bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+ Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ int Selector = SVI.getMaskValue(I);
+ if (Selector < 0)
+ Res[I] = UndefValue::get(VT->getElementType());
+ else if (unsigned(Selector) < Op0.size())
+ Res[I] = Op0[Selector];
+ else
+ Res[I] = Op1[Selector - Op0.size()];
+ }
+ gather(&SVI, Res);
+ return true;
+}
+
+bool Scalarizer::visitPHINode(PHINode &PHI) {
+ VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = VT->getNumElements();
+ IRBuilder<> Builder(&PHI);
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ unsigned NumOps = PHI.getNumOperands();
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+ PHI.getName() + ".i" + Twine(I));
+
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+ BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+ for (unsigned J = 0; J < NumElems; ++J)
+ cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+ }
+ gather(&PHI, Res);
+ return true;
+}
+
+bool Scalarizer::visitLoadInst(LoadInst &LI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!LI.isSimple())
+ return false;
+
+ VectorLayout Layout;
+ if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout,
+ LI.getModule()->getDataLayout()))
+ return false;
+
+ unsigned NumElems = Layout.VecTy->getNumElements();
+ IRBuilder<> Builder(&LI);
+ Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I),
+ LI.getName() + ".i" + Twine(I));
+ gather(&LI, Res);
+ return true;
+}
+
+bool Scalarizer::visitStoreInst(StoreInst &SI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!SI.isSimple())
+ return false;
+
+ VectorLayout Layout;
+ Value *FullValue = SI.getValueOperand();
+ if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout,
+ SI.getModule()->getDataLayout()))
+ return false;
+
+ unsigned NumElems = Layout.VecTy->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
+ Scatterer Val = scatter(&SI, FullValue);
+
+ ValueVector Stores;
+ Stores.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned Align = Layout.getElemAlign(I);
+ Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align);
+ }
+ transferMetadata(&SI, Stores);
+ return true;
+}
+
+bool Scalarizer::visitCallInst(CallInst &CI) {
+ return splitCall(CI);
+}
+
+// Delete the instructions that we scalarized. If a full vector result
+// is still needed, recreate it using InsertElements.
+bool Scalarizer::finish() {
+ // The presence of data in Gathered or Scattered indicates changes
+ // made to the Function.
+ if (Gathered.empty() && Scattered.empty())
+ return false;
+ for (const auto &GMI : Gathered) {
+ Instruction *Op = GMI.first;
+ ValueVector &CV = *GMI.second;
+ if (!Op->use_empty()) {
+ // The value is still needed, so recreate it using a series of
+ // InsertElements.
+ Type *Ty = Op->getType();
+ Value *Res = UndefValue::get(Ty);
+ BasicBlock *BB = Op->getParent();
+ unsigned Count = Ty->getVectorNumElements();
+ IRBuilder<> Builder(Op);
+ if (isa<PHINode>(Op))
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ for (unsigned I = 0; I < Count; ++I)
+ Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+ Op->getName() + ".upto" + Twine(I));
+ Res->takeName(Op);
+ Op->replaceAllUsesWith(Res);
+ }
+ Op->eraseFromParent();
+ }
+ Gathered.clear();
+ Scattered.clear();
+ return true;
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+ return new Scalarizer();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
new file mode 100644
index 000000000000..84675f41cdd5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -0,0 +1,1264 @@
+//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+// for (int j = 0; j < 2; ++j) {
+// ...
+// ... = a[x + i][y + j];
+// ...
+// }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32 %r1, %x;
+// mov.u32 %r2, %y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
+// load %p2
+// ...
+//
+// We cannot do CSE on the common part related to index "i64 %i". Lowering
+// GEPs can achieve such goals.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %3, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p2 = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
+// indices if one of the indices is variant. If we lower such a GEP into
+// invariant parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+ "disable-separate-const-offset-from-gep", cl::init(false),
+ cl::desc("Do not separate the constant offset from a GEP instruction"),
+ cl::Hidden);
+// Setting this flag may emit false positives when the input module already
+// contains dead instructions. Therefore, we set it only in unit tests that are
+// free of dead code.
+static cl::opt<bool>
+ VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+ cl::desc("Verify this pass produces no dead code"),
+ cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+ /// Extracts a constant offset from the given GEP index. It returns the
+ /// new index representing the remainder (equal to the original index minus
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
+ /// \p Idx The given GEP index
+ /// \p GEP The given GEP
+ /// \p UserChainTail Outputs the tail of UserChain so that we can
+ /// garbage-collect unused instructions in UserChain.
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail, const DominatorTree *DT);
+ /// Looks for a constant offset from the given GEP index without extracting
+ /// it. It returns the numeric value of the extracted constant offset (0 if
+ /// failed). The meaning of the arguments are the same as Extract.
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT);
+
+private:
+ ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+ }
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+ /// V can be reassociated into the form V' + C. If the search is
+ /// successful, returns C and updates UserChain as a def-use chain from C to V;
+ /// otherwise, UserChain is empty.
+ ///
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+ /// non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+ /// A helper function to look into both operands of a binary operator.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+ /// After the first step of rebuilding the GEP index without the constant
+ /// offset, distribute s/zext to the operands of all operators in UserChain.
+ /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
+
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
+
+ /// The path from the constant offset to the old GEP index. e.g., if the GEP
+ /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
+ /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+ /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+ ///
+ /// This path helps to rebuild the new GEP index.
+ SmallVector<User *, 8> UserChain;
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
+ Instruction *IP; /// Insertion position of cloned instructions.
+ const DataLayout &DL;
+ const DominatorTree *DT;
+};
+
+/// \brief A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
+class SeparateConstOffsetFromGEP : public FunctionPass {
+public:
+ static char ID;
+ SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+ bool LowerGEP = false)
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+ initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// Tries to split the given GEP into a variadic base and a constant offset,
+ /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP);
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+ /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+ /// Finds the constant offset within each index and accumulates them. If
+ /// LowerGEP is true, it searches indices of both sequential and structure
+ /// types; otherwise it only searches sequential indices. The output
+ /// NeedsExtraction indicates whether we successfully found a non-zero constant
+ /// offset.
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+ /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+ /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+ /// the constant offset. After extraction, it becomes desirable to reunion the
+ /// distributed sexts. For example,
+ ///
+ ///   &a[sext(i +nsw (j +nsw 5))]
+ /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
+ /// => constant extraction &a[sext(i) + sext(j)] + 5
+ /// => reunion &a[sext(i +nsw j)] + 5
+ bool reuniteExts(Function &F);
+ /// A helper that reunites sexts in an instruction.
+ bool reuniteExts(Instruction *I);
+ /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(const SCEV *Key,
+ Instruction *Dominatee);
+ /// Verify F is free of dead code.
+ void verifyNoDeadCode(Function &F);
+
+ bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+ // Swap the index operand of two GEP.
+ // Swap the index operands of two GEPs.
+ // Check if it is safe to swap operand of two GEP.
+ // Check if it is safe to swap the operands of two GEPs.
+ Loop *CurLoop);
+
+ const DataLayout *DL;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const TargetMachine *TM;
+
+ LoopInfo *LI;
+ TargetLibraryInfo *TLI;
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+};
+} // anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+ bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+ // Leveraging this invariant, we can trace into an sext'ed inbound GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
+
+ // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+ // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub) {
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
+ }
+
+ return true;
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the
+ // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+ // However, such cases are probably already handled by -instcombine,
+ // given this pass runs after the standard optimizations.
+ if (ConstantOffset != 0) return ConstantOffset;
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If U is a sub operator, negate the constant offset found in the right
+ // operand.
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+ return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
+ // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+ // integers because it gives good enough results for our benchmarks.
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ // We cannot do much with Values that are not a User, such as an Argument.
+ User *U = dyn_cast<User>(V);
+ if (U == nullptr) return APInt(BitWidth, 0);
+
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // Hooray, we found it!
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+ }
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization.
+ if (ConstantOffset != 0)
+ UserChain.push_back(U);
+ return ConstantOffset;
+}
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in the use-def order. Therefore, we apply them to V
+ // in the reversed order.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone();
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
+ }
+ return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (used to be s/zext) from UserChain.
+ unsigned NewSize = 0;
+ for (User *I : UserChain) {
+ if (I != nullptr) {
+ UserChain[NewSize] = I;
+ NewSize++;
+ }
+ }
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+ "We only traced into two types of CastInst: sext and zext");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr;
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+ // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ assert(BO->getNumUses() <= 1 &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+ // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ NewOp = Instruction::Add;
+ }
+
+ BinaryOperator *NewBO;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+ } else {
+ NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+ }
+ NewBO->takeName(BO);
+ return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail,
+ const DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP, DT);
+ // Find a non-zero constant offset first.
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset == 0) {
+ UserChainTail = nullptr;
+ return nullptr;
+ }
+ // Separates the constant offset from the GEP index.
+ Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+ UserChainTail = Extractor.UserChain.back();
+ return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT) {
+ // If Idx is an index of an inbounds GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(GEP, DT)
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
+}
+
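+// For illustration (an assumed example, on a target with 64-bit pointers): an
+// array index narrower than the pointer width, such as
+//   %gep = getelementptr float, float* %p, i32 %i
+// is canonicalized to
+//   %idxprom = sext i32 %i to i64
+//   %gep = getelementptr float, float* %p, i64 %idxprom
+// so that later offset arithmetic happens in the pointer-sized type.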
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (GTI.isSequential()) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
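+// For illustration (an assumed type and GEP, with a typical 64-bit data
+// layout): given
+//   %struct.S = type { i32, [4 x i64] }
+//   %k = add i64 %j, 2
+//   %p = getelementptr %struct.S, %struct.S* %s, i64 %i, i32 1, i64 %k
+// the last index contributes 2 * sizeof(i64) = 16 bytes, and, when LowerGEP
+// is set, the field index 1 additionally contributes its struct-layout offset
+// of 8 bytes, for an accumulated byte offset of 24.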
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
+ NeedsExtraction = false;
+ int64_t AccumulativeByteOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Tries to extract a constant offset from this GEP index.
+ int64_t ConstantOffset =
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ if (ConstantOffset != 0) {
+ NeedsExtraction = true;
+ // A GEP may have multiple indices. We accumulate the extracted
+ // constant offset to a byte offset, and later offset the remainder of
+ // the original GEP with this byte offset.
+ AccumulativeByteOffset +=
+ ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ }
+ } else if (LowerGEP) {
+ StructType *StTy = GTI.getStructType();
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
+ }
+ }
+ return AccumulativeByteOffset;
+}
+
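+// For illustration (an assumed variadic GEP whose accumulated byte offset is
+// 12), a GEP such as
+//   %p = getelementptr [10 x float], [10 x float]* %a, i64 %i, i64 %j
+// is lowered to single-index i8 GEPs roughly as
+//   %0 = bitcast [10 x float]* %a to i8*
+//   %1 = mul i64 %i, 40                      ; 40 = 10 * sizeof(float)
+//   %uglygep = getelementptr i8, i8* %0, i64 %1
+//   %2 = shl i64 %j, 2                       ; 4 = sizeof(float)
+//   %uglygep1 = getelementptr i8, i8* %uglygep, i64 %2
+//   %uglygep2 = getelementptr i8, i8* %uglygep1, i64 12
+//   %p = bitcast i8* %uglygep2 to float*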
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+ GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+ Value *ResultPtr = Variadic->getOperand(0);
+ Loop *L = LI->getLoopFor(Variadic->getParent());
+ // The base is a candidate for swapping only if it is loop invariant and
+ // not used more than once in the loop.
+ bool isSwapCandidate =
+ L && L->isLoopInvariant(ResultPtr) &&
+ !hasMoreThanOneUseInLoop(ResultPtr, L);
+ Value *FirstResult = nullptr;
+
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create an ugly GEP for each sequential index. We don't create GEPs for
+ // structure indices, as they are accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ugly GEP with a single index for each index.
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+ if (FirstResult == nullptr)
+ FirstResult = ResultPtr;
+ }
+ }
+
+ // Create a GEP with the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+ } else
+ isSwapCandidate = false;
+
+ // If we created a GEP with a constant index and the base is loop invariant,
+ // swap the first GEP with the constant one so that LICM can hoist the
+ // constant GEP out of the loop later.
+ GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
+ GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
+ if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+ swapGEPOperand(FirstGEP, SecondGEP);
+
+ if (ResultPtr->getType() != Variadic->getType())
+ ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
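+// For illustration (an assumed variadic GEP with a 12-byte accumulated
+// offset):
+//   %p = getelementptr [10 x float], [10 x float]* %a, i64 %i, i64 %j
+// is lowered to plain integer arithmetic roughly as
+//   %0 = ptrtoint [10 x float]* %a to i64
+//   %1 = mul i64 %i, 40
+//   %2 = add i64 %0, %1
+//   %3 = shl i64 %j, 2
+//   %4 = add i64 %2, %3
+//   %5 = add i64 %4, 12
+//   %p = inttoptr i64 %5 to float*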
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+ // don't create arithmetic for structure indices, as they are accumulated
+ // in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ADD for each index.
+ ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+ }
+ }
+
+ // Create an ADD for the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ ResultPtr = Builder.CreateAdd(
+ ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+ }
+
+ ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
+
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+ // Skip vector GEPs.
+ if (GEP->getType()->isVectorTy())
+ return false;
+
+ // The backend can already nicely handle the case where all indices are
+ // constant.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
+ bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+ bool NeedsExtraction;
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+ if (!NeedsExtraction)
+ return Changed;
+ // If LowerGEP is disabled, check, before really splitting the GEP, whether
+ // the backend supports the addressing mode we are about to produce. If not,
+ // this splitting probably won't be beneficial.
+ // If LowerGEP is enabled, then even if the extracted constant offset cannot
+ // match the addressing mode, we can still optimize the other lowered parts of
+ // the variable indices. Therefore, we don't check addressing modes in that
+ // case.
+ if (!LowerGEP) {
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *GEP->getParent()->getParent());
+ unsigned AddrSpace = GEP->getPointerAddressSpace();
+ if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0,
+ AddrSpace)) {
+ return Changed;
+ }
+ }
+
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+ // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *OldIdx = GEP->getOperand(I);
+ User *UserChainTail;
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ if (NewIdx != nullptr) {
+ // Switches to the index with the constant offset removed.
+ GEP->setOperand(I, NewIdx);
+ // After switching to the new index, we can garbage-collect UserChain
+ // and the old index if they are not used.
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+ }
+ }
+ }
+
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float, float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float, float* p, i64 a ; inbounds removed
+ // addr = gep inbounds float, float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+ // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ bool GEPWasInBounds = GEP->isInBounds();
+ GEP->setIsInBounds(false);
+
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+ // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+ // arithmetic operations if the target uses alias analysis in codegen.
+ if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
+ // Offsets the base with the accumulative byte offset.
+ //
+ // %gep ; the base
+ // ... %gep ...
+ //
+ // => add the offset
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // %gep ; will be removed
+ // ... %gep ...
+ //
+ // => replace all uses of %gep with %new.gep and remove %gep
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // ... %new.gep ...
+ //
+ // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+ // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+ // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+ // type of %gep.
+ //
+ // %gep2 ; clone of %gep
+ // %0 = bitcast %gep2 to i8*
+ // %uglygep = gep %0, <offset>
+ // %new.gep = bitcast %uglygep to <type of %gep>
+ // ... %new.gep ...
+ Instruction *NewGEP = GEP->clone();
+ NewGEP->insertBefore(GEP);
+
+ // Per the ANSI C standard, signed / unsigned = unsigned and signed % unsigned
+ // = unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed so that the
+ // division and modulo with the signed AccumulativeByteOffset below stay
+ // signed.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getResultElementType()));
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
+ // extracted should be a multiple of sizeof(*%gep).
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ } else {
+ // Unlikely but possible. For example,
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+ // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+ // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+ // sizeof(int64).
+ //
+ // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+ GEP->getPointerAddressSpace());
+ NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+ NewGEP = GetElementPtrInst::Create(
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+ }
+
+ GEP->replaceAllUsesWith(NewGEP);
+ GEP->eraseFromParent();
+
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ if (DisableSeparateConstOffsetFromGEP)
+ return false;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ bool Changed = false;
+ for (BasicBlock &B : F) {
+ for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+ Changed |= splitGEP(GEP);
+ // No need to split GEP ConstantExprs because all their indices are already
+ // constant.
+ }
+
+ Changed |= reuniteExts(F);
+
+ if (VerifyNoDeadCode)
+ verifyNoDeadCode(F);
+
+ return Changed;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee) {
+ auto Pos = DominatingExprs.find(Key);
+ if (Pos == DominatingExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ Instruction *Candidate = Candidates.back();
+ if (DT->dominates(Candidate, Dominatee))
+ return Candidate;
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
+
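+// For illustration (an assumed snippet): given a dominating no-signed-wrap add
+//   %d = add nsw i32 %a, %b
+// a later widened form
+//   %x = sext i32 %a to i64
+//   %y = sext i32 %b to i64
+//   %i = add i64 %x, %y
+// is rewritten to reuse the narrow add, roughly as
+//   %i = sext i32 %d to i64
+// because the nsw add is known not to sign-overflow.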
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Dom: LHS+RHS
+ // I: sext(LHS)+sext(RHS)
+ // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+ // TODO: handle zext
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
+ match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ }
+
+ // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+ if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
+ match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfFullPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingExprs[Key].push_back(I);
+ }
+ }
+ return false;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+ bool Changed = false;
+ DominatingExprs.clear();
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ) {
+ Instruction *Cur = &*I++;
+ Changed |= reuniteExts(Cur);
+ }
+ }
+ return Changed;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ if (isInstructionTriviallyDead(&I)) {
+ std::string ErrMessage;
+ raw_string_ostream RSO(ErrMessage);
+ RSO << "Dead instruction detected!\n" << I << "\n";
+ llvm_unreachable(RSO.str().c_str());
+ }
+ }
+ }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+ GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+ if (!FirstGEP || !FirstGEP->hasOneUse())
+ return false;
+
+ if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+ return false;
+
+ if (FirstGEP == SecondGEP)
+ return false;
+
+ unsigned FirstNum = FirstGEP->getNumOperands();
+ unsigned SecondNum = SecondGEP->getNumOperands();
+ // Give up if the number of operands is not 2.
+ if (FirstNum != SecondNum || FirstNum != 2)
+ return false;
+
+ Value *FirstBase = FirstGEP->getOperand(0);
+ Value *SecondBase = SecondGEP->getOperand(0);
+ Value *FirstOffset = FirstGEP->getOperand(1);
+ // Give up if the index of the first GEP is loop invariant.
+ if (CurLoop->isLoopInvariant(FirstOffset))
+ return false;
+
+ // Give up if the bases don't have the same type.
+ if (FirstBase->getType() != SecondBase->getType())
+ return false;
+
+ Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+ // Check whether the second operand of the first GEP has a constant
+ // coefficient. For example, in the following code we won't gain anything by
+ // hoisting the second GEP out because the second GEP can be folded away.
+ // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+ // %67 = shl i64 %scevgep.sum.ur159, 2
+ // %uglygep160 = getelementptr i8* %65, i64 %67
+ // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+ // Skip a constant shift instruction, which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+ isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
+ FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+ // Give up if FirstOffsetDef is an Add or Sub with a constant, because it may
+ // not be profitable at all due to constant folding.
+ if (FirstOffsetDef)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+ unsigned opc = BO->getOpcode();
+ if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+ (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1))))
+ return false;
+ }
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+ int UsesInLoop = 0;
+ for (User *U : V->users()) {
+ if (Instruction *User = dyn_cast<Instruction>(U))
+ if (L->contains(User))
+ if (++UsesInLoop > 1)
+ return true;
+ }
+ return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+ GetElementPtrInst *Second) {
+ Value *Offset1 = First->getOperand(1);
+ Value *Offset2 = Second->getOperand(1);
+ First->setOperand(1, Offset2);
+ Second->setOperand(1, Offset1);
+
+ // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
+ const DataLayout &DAL = First->getModule()->getDataLayout();
+ APInt Offset(DAL.getPointerSizeInBits(
+ cast<PointerType>(First->getType())->getAddressSpace()),
+ 0);
+ Value *NewBase =
+ First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+ uint64_t ObjectSize;
+ if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+ Offset.ugt(ObjectSize)) {
+ First->setIsInBounds(false);
+ Second->setIsInBounds(false);
+ } else
+ First->setIsInBounds(true);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
new file mode 100644
index 000000000000..aaab5857e0f1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -0,0 +1,808 @@
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+
+static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
+ Constant &Replacement) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI || !L.contains(UserI))
+ continue;
+
+ // Replace this use within the loop body.
+ *U = &Replacement;
+ }
+}
+
+/// Update the dominator tree after removing one exiting predecessor of a loop
+/// exit block.
+static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
+ DominatorTree &DT) {
+ assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
+ "Cannot have empty predecessors of the loop exit block if we split "
+ "off a block to unswitch!");
+
+ BasicBlock *IDom = *pred_begin(LoopExitBB);
+ // Walk all of the other predecessors finding the nearest common dominator
+ // until all predecessors are covered or we reach the loop header. The loop
+ // header necessarily dominates all loop exit blocks in loop simplified form
+ // so we can early-exit the moment we hit that block.
+ for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB);
+ PI != PE && IDom != L.getHeader(); ++PI)
+ IDom = DT.findNearestCommonDominator(IDom, *PI);
+
+ DT.changeImmediateDominator(LoopExitBB, IDom);
+}
+
+/// Update the dominator tree after unswitching a particular former exit block.
+///
+/// This handles the full update of the dominator tree after hoisting a block
+/// that previously was an exit block (or split off of an exit block) up to be
+/// reached from the new immediate dominator of the preheader.
+///
+/// The common case is simple -- we just move the unswitched block to have an
+/// immediate dominator of the old preheader. But in complex cases, there may
+/// be other blocks reachable from the unswitched block that are immediately
+/// dominated by some node between the unswitched one and the old preheader.
+/// All of these also need to be hoisted in the dominator tree. We also want to
+/// minimize queries to the dominator tree because each step of this
+/// invalidates any DFS numbers that would make queries fast.
+static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
+ DominatorTree &DT) {
+ DomTreeNode *OldPHNode = DT[OldPH];
+ DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
+ // If the dominator tree has already been updated for this unswitched node,
+ // we're done. This makes it easier to use this routine if there are multiple
+ // paths to the same unswitched destination.
+ if (UnswitchedNode->getIDom() == OldPHNode)
+ return;
+
+ // First collect the domtree nodes that we are hoisting over. These are the
+ // set of nodes which may have children that need to be hoisted as well.
+ SmallPtrSet<DomTreeNode *, 4> DomChain;
+ for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
+ IDom = IDom->getIDom())
+ DomChain.insert(IDom);
+
+ // The unswitched block ends up immediately dominated by the old preheader --
+ // regardless of whether it is the loop exit block or split off of the loop
+ // exit block.
+ DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
+
+ // For everything that moves up the dominator tree, we need to examine the
+ // dominator frontier to see if it additionally should move up the dominator
+ // tree. This lambda appends the dominator frontier for a node on the
+ // worklist.
+ //
+ // Note that we don't currently use the IDFCalculator here for two reasons:
+ // 1) It computes dominator tree levels for the entire function on each run
+ // of 'compute'. While this isn't terrible, given that we expect to update
+ // relatively small subtrees of the domtree, it isn't necessarily the right
+ // tradeoff.
+ // 2) The interface doesn't fit this usage well. It doesn't operate in an
+ // append-only fashion, and it builds several sets that we don't need.
+ //
+ // FIXME: Neither of these issues are a big deal and could be addressed with
+ // some amount of refactoring of IDFCalculator. That would allow us to share
+ // the core logic here (which is solving the same core problem).
+ SmallSetVector<BasicBlock *, 4> Worklist;
+ SmallVector<DomTreeNode *, 4> DomNodes;
+ SmallPtrSet<BasicBlock *, 4> DomSet;
+ auto AppendDomFrontier = [&](DomTreeNode *Node) {
+ assert(DomNodes.empty() && "Must start with no dominator nodes.");
+ assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+ // First flatten this subtree into a sequence of nodes by doing a pre-order
+ // walk.
+ DomNodes.push_back(Node);
+ // We intentionally re-evaluate the size as each node can add new children.
+ // Because this is a tree walk, this cannot add any duplicates.
+ for (int i = 0; i < (int)DomNodes.size(); ++i)
+ DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+ // Now create a set of the basic blocks so we can quickly test for
+ // dominated successors. We could in theory use the DFS numbers of the
+ // dominator tree for this, but we want this to remain predictably fast
+ // even while we mutate the dominator tree in ways that would invalidate
+ // the DFS numbering.
+ for (DomTreeNode *InnerN : DomNodes)
+ DomSet.insert(InnerN->getBlock());
+
+ // Now re-walk the nodes, appending every successor of every node that isn't
+ // in the set. Note that we don't append the node itself, even though if it
+ // is a successor it does not strictly dominate itself and thus it would be
+ // part of the dominance frontier. The reason we don't append it is that
+ // the node passed in came *from* the worklist and so it has already been
+ // processed.
+ for (DomTreeNode *InnerN : DomNodes)
+ for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+ if (!DomSet.count(SuccBB))
+ Worklist.insert(SuccBB);
+
+ DomNodes.clear();
+ DomSet.clear();
+ };
+
+ // Append the initial dom frontier nodes.
+ AppendDomFrontier(UnswitchedNode);
+
+ // Walk the worklist. We grow the list in the loop and so must recompute size.
+ for (int i = 0; i < (int)Worklist.size(); ++i) {
+ auto *BB = Worklist[i];
+
+ DomTreeNode *Node = DT[BB];
+ assert(!DomChain.count(Node) &&
+ "Cannot be dominated by a block you can reach!");
+
+ // If this block had an immediate dominator somewhere in the chain
+ // we hoisted over, then its position in the domtree needs to move as it is
+ // reachable from a node hoisted over this chain.
+ if (!DomChain.count(Node->getIDom()))
+ continue;
+
+ DT.changeImmediateDominator(Node, OldPHNode);
+
+ // Now add this node's dominator frontier to the worklist as well.
+ AppendDomFrontier(Node);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
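+///
+/// For illustration (an assumed LCSSA PHI): if the exit block contained
+///   %p = phi i32 [ %v, %exiting ], [ %w, %other ]
+/// then after the rewrite the exit block keeps
+///   %p = phi i32 [ %w, %other ]
+/// and the unswitched block selects between the two paths with
+///   %p.split = phi i32 [ %v, %old.preheader ], [ %p, %exit ]
+/// with all former uses of %p now using %p.split.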
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// If this routine fails to unswitch the branch it returns false.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
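+///
+/// For illustration (an assumed loop): if a side-effect-free header ends in
+///   br i1 %invariant.cond, label %body, label %exit
+/// the branch is hoisted so that the old preheader ends in
+///   br i1 %invariant.cond, label %new.preheader, label %exit
+/// (splitting %exit first if it has other predecessors), the in-loop
+/// terminator becomes an unconditional branch to %body, and in-loop uses of
+/// %invariant.cond are replaced with the constant value that keeps the loop
+/// running.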
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ Value *LoopCond = BI.getCondition();
+
+ // Need a trivial loop condition to unswitch.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
+ ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (!ExitBlockSet.count(LoopExitBB)) {
+ std::swap(CondVal, Replacement);
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (!ExitBlockSet.count(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ assert(L.contains(ContinueBB) &&
+ "Cannot have both successors exit and still be in the loop!");
+
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
+ << " == " << LoopCond << "\n");
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
+ (void)PredBB;
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
+ }
+
+ // Now splice the branch into the old preheader so that it gates reaching the
+ // new preheader, and re-point its successors.
+ OldPH->getInstList().splice(std::prev(OldPH->end()),
+ BI.getParent()->getInstList(), BI);
+ OldPH->getTerminator()->eraseFromParent();
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
+ // Now we need to update the dominator tree.
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ // But if we split something off of the loop exit block then we also removed
+ // one of the predecessors for the loop exit block and may need to update its
+ // idom.
+ if (UnswitchedBB != LoopExitBB)
+ updateLoopExitIDom(LoopExitBB, L, DT);
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
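+///
+/// For illustration (an assumed loop): given an invariant %c and an in-loop
+/// terminator
+///   switch i32 %c, label %body [ i32 0, label %exit0
+///                                i32 1, label %exit1 ]
+/// the old preheader ends up with
+///   switch i32 %c, label %new.preheader [ i32 0, label %exit0
+///                                         i32 1, label %exit1 ]
+/// (splitting the exit blocks first if they have other predecessors), and,
+/// because no exiting case remains, the switch left in the loop is simplified
+/// to an unconditional branch to %body.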
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI) {
+ DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases()) {
+ auto *SuccBB = Case.getCaseSuccessor();
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ }
+ BasicBlock *DefaultExitBB = nullptr;
+ if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
+ !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
+ DefaultExitBB = SI.getDefaultDest();
+ else if (ExitCaseIndices.empty())
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Save the value of this case.
+ ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
+ // Delete the unswitched cases.
+ SI.removeCase(CaseI);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
+ std::all_of(std::next(SI.case_begin()), SI.case_end(),
+ [&SI](const SwitchInst::CaseHandle &Case) {
+ return Case.getCaseSuccessor() ==
+ SI.case_begin()->getCaseSuccessor();
+ }))
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+
+ if (DefaultExitBB) {
+ // We can't remove the default edge so replace it with an edge to either
+ // the single common remaining successor (if we have one) or an unreachable
+ // block.
+ if (CommonSuccBB) {
+ SI.setDefaultDest(CommonSuccBB);
+ } else {
+ BasicBlock *UnreachableBB = BasicBlock::Create(
+ ParentBB->getContext(),
+ Twine(ParentBB->getName()) + ".unreachable_default",
+ ParentBB->getParent());
+ new UnreachableInst(ParentBB->getContext(), UnreachableBB);
+ SI.setDefaultDest(UnreachableBB);
+ DT.addNewBlock(UnreachableBB, ParentBB);
+ }
+ } else {
+ // If we're not unswitching the default, then for there to be a common
+ // successor the default must match the remaining cases; if no cases remain,
+ // the default itself is the common successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &CasePair : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = CasePair.second;
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(ExitBB, L, DT);
+ }
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto CasePair : reverse(ExitCases)) {
+ ConstantInt *CaseVal = CasePair.first;
+ BasicBlock *UnswitchedBB = CasePair.second;
+
+ NewSI->addCase(CaseVal, UnswitchedBB);
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSI->setDefaultDest(DefaultExitBB);
+ updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (auto Case : SI.cases())
+ NewSI->addCase(Case.getCaseValue(), NewPH);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ SI.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ }
+
+ DT.verifyDomTree();
+ ++NumTrivial;
+ ++NumSwitches;
+ return true;
+}
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+ LoopInfo &LI) {
+ bool Changed = false;
+
+ // If loop header has only one reachable successor we should keep looking for
+ // trivial condition candidates in the successor as well. An alternative is
+ // to constant fold conditions and merge successors into loop header (then we
+ // only need to check header's terminator). The reason for not doing this in
+ // LoopUnswitch pass is that it could potentially break LoopPassManager's
+ // invariants. Folding dead branches could either eliminate the current loop
+ // or make other loops unreachable. LCSSA form might also not be preserved
+ // after deleting branches. The following code keeps traversing loop header's
+ // successors until it finds the trivial condition candidate (condition that
+ // is not a constant). Since unswitching generates branches with constant
+ // conditions, this scenario could be very common in practice.
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI))
+ // Couldn't unswitch this one, so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // We unswitched the branch. This should always leave us with an
+ // unconditional branch that we can follow now.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ assert(!BI->isConditional() &&
+ "Cannot form a conditional branch by unswitching1");
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC) {
+ assert(L.isLCSSAForm(DT) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitching first, before looping over the other basic blocks
+ // in the loop.
+ Changed |= unswitchAllTrivialConditions(L, DT, LI);
+
+ // FIXME: Add support for non-trivial unswitching by cloning the loop.
+
+ return Changed;
+}
+
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+ (void)F;
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC))
+ return PreservedAnalyses::all();
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ AR.DT.verifyDomTree();
+#endif
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) {
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ bool Changed = unswitchLoop(*L, DT, LI, AC);
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ DT.verifyDomTree();
+#endif
+ return Changed;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+Pass *llvm::createSimpleLoopUnswitchLegacyPass() {
+ return new SimpleLoopUnswitchLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 000000000000..8754c714c5b2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,287 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Change things like "if (x) if (y)" into "if (x&y)".
+// * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+static cl::opt<unsigned>
+UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return block,
+/// merge them together to promote recursive block merging.
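+///
+/// For illustration (an assumed CFG): two otherwise-empty return blocks
+///   bb1: ret i32 %x
+///   bb2: ret i32 %y
+/// are merged by keeping bb1 as the canonical return block, giving it a phi
+///   %merge = phi i32 [ %x, <preds of bb1> ], [ %y, %bb2 ]
+///   ret i32 %merge
+/// and rewriting bb2 into an unconditional branch to bb1.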
+static bool mergeEmptyReturnBlocks(Function &F) {
+ bool Changed = false;
+
+ BasicBlock *RetBlock = nullptr;
+
+ // Scan all the blocks in the function, looking for empty return blocks.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+ BasicBlock &BB = *BBI++;
+
+ // Only look at return blocks.
+ ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Ret) continue;
+
+ // Only look at the block if it is empty or the only other thing in it is a
+ // single PHI node that is the operand to the return.
+ if (Ret != &BB.front()) {
+ // Check for something else in the block.
+ BasicBlock::iterator I(Ret);
+ --I;
+ // Skip over debug info.
+ while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+ --I;
+ if (!isa<DbgInfoIntrinsic>(I) &&
+ (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) != &*I))
+ continue;
+ }
+
+ // If this is the first returning block, remember it and keep going.
+ if (!RetBlock) {
+ RetBlock = &BB;
+ continue;
+ }
+
+ // Otherwise, we found a duplicate return block. Merge the two.
+ Changed = true;
+
+ // The case when there is no input to the return, or when the returned values
+ // agree, is trivial. Note that they can't agree if there are phis in the
+ // blocks.
+ if (Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) ==
+ cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+ BB.replaceAllUsesWith(RetBlock);
+ BB.eraseFromParent();
+ continue;
+ }
+
+ // If the canonical return block has no PHI node, create one now.
+ PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+ if (!RetBlockPHI) {
+ Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+ pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+ RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+ std::distance(PB, PE), "merge",
+ &RetBlock->front());
+
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ RetBlockPHI->addIncoming(InVal, *PI);
+ RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+ }
+
+ // Turn BB into a block that just unconditionally branches to the return
+ // block. This handles the case when the two return blocks have a common
+    // predecessor but return different things.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+ BB.getTerminator()->eraseFromParent();
+ BranchInst::Create(RetBlock, &BB);
+ }
+
+ return Changed;
+}
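+
+// For illustration only (hypothetical IR): given two return blocks that both
+// return the same value,
+//
+//   ret1:                 ret2:
+//     ret i32 %v            ret i32 %v
+//
+// the second block is erased and its predecessors are redirected to the first.
+// When the returned values differ, the canonical block instead receives a
+// "merge" PHI of the incoming values, and the duplicate block is turned into
+// an unconditional branch to it.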
+
+/// Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ AssumptionCache *AC,
+ unsigned BonusInstThreshold,
+ bool LateSimplifyCFG) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+ SmallPtrSet<BasicBlock *, 16> LoopHeaders;
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ LoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
+
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and remove them if they are unneeded.
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
+ LocalChange = true;
+ ++NumSimpl;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+ AssumptionCache *AC, int BonusInstThreshold,
+ bool LateSimplifyCFG) {
+ bool EverChanged = removeUnreachableBlocks(F);
+ EverChanged |= mergeEmptyReturnBlocks(F);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
+
+ // If neither pass changed anything, we're done.
+ if (!EverChanged) return false;
+
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocks is needed to nuke them, which means we should
+ // iterate between the two optimizations. We structure the code like this to
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocks doesn't do anything.
+ if (!removeUnreachableBlocks(F))
+ return true;
+
+ do {
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
+ EverChanged |= removeUnreachableBlocks(F);
+ } while (EverChanged);
+
+ return true;
+}
+
+SimplifyCFGPass::SimplifyCFGPass()
+ : BonusInstThreshold(UserBonusInstThreshold),
+ LateSimplifyCFG(true) {}
+
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+ : BonusInstThreshold(BonusInstThreshold),
+ LateSimplifyCFG(LateSimplifyCFG) {}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct BaseCFGSimplifyPass : public FunctionPass {
+ unsigned BonusInstThreshold;
+ std::function<bool(const Function &)> PredicateFtor;
+ bool LateSimplifyCFG;
+
+ BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+ std::function<bool(const Function &)> Ftor,
+ char &ID)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+ LateSimplifyCFG(LateSimplifyCFG) {
+ BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
+ return false;
+
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, false, Ftor, ID) {
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
+
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ LateCFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, true, Ftor, ID) {
+ initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+char LateCFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *
+llvm::createCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new CFGSimplifyPass(Threshold, std::move(Ftor));
+}
+
+// Public interface to the LateCFGSimplification pass
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+}
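+
+// Usage sketch (illustration only; assumes an existing Module M and Function F
+// plus the usual LLVM headers, none of which are provided by this file). With
+// the legacy pass manager the factories above are the entry points; with the
+// new pass manager, SimplifyCFGPass is added directly:
+//
+//   llvm::legacy::FunctionPassManager LFPM(&M);
+//   LFPM.add(llvm::createCFGSimplificationPass(-1, nullptr));
+//   LFPM.run(F);
+//
+//   llvm::PassBuilder PB;
+//   llvm::FunctionAnalysisManager FAM;
+//   PB.registerFunctionAnalyses(FAM); // provides TargetIRAnalysis and
+//                                     // AssumptionAnalysis used by run()
+//   llvm::FunctionPassManager FPM;
+//   FPM.addPass(llvm::SimplifyCFGPass());
+//   FPM.run(F, FAM);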
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 000000000000..5210f165b874
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,306 @@
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified value
+/// occur in blocks dominated by the specified block.
+static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
+ DominatorTree &DT) {
+ // Ignoring debug uses is necessary so debug info doesn't affect the code.
+ // This may leave a referencing dbg_value in the original block, before
+ // the definition of the vreg. Dwarf generator handles this although the
+ // user might not get the right info at runtime.
+ for (Use &U : Inst->uses()) {
+ // Determine the block of the use.
+ Instruction *UseInst = cast<Instruction>(U.getUser());
+ BasicBlock *UseBlock = UseInst->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBlock = PN->getIncomingBlock(Num);
+ }
+ // Check that it dominates.
+ if (!DT.dominates(BB, UseBlock))
+ return false;
+ }
+ return true;
+}
+
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
+
+ if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+ MemoryLocation Loc = MemoryLocation::get(L);
+ for (Instruction *S : Stores)
+ if (AA.getModRefInfo(S, Loc) & MRI_Mod)
+ return false;
+ }
+
+ if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() ||
+ Inst->mayThrow())
+ return false;
+
+ if (auto CS = CallSite(Inst)) {
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (CS.hasFnAttr(Attribute::Convergent))
+ return false;
+
+ for (Instruction *S : Stores)
+ if (AA.getModRefInfo(S, CS) & MRI_Mod)
+ return false;
+ }
+
+ return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// into the specified basic block.
+static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
+ DominatorTree &DT, LoopInfo &LI) {
+ assert(Inst && "Instruction to be sunk is null");
+ assert(SuccToSinkTo && "Candidate sink target is null");
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with loops.
+ if (Inst->getParent() == SuccToSinkTo)
+ return false;
+
+ // It's never legal to sink an instruction into a block which terminates in an
+ // EH-pad.
+ if (SuccToSinkTo->getTerminator()->isExceptional())
+ return false;
+
+ // If the block has multiple predecessors, this would introduce computation
+ // on different code paths. We could split the critical edge, but for now we
+ // just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ if (isa<LoadInst>(Inst))
+ return false;
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
+ return false;
+
+ // Don't sink instructions into a loop.
+ Loop *succ = LI.getLoopFor(SuccToSinkTo);
+ Loop *cur = LI.getLoopFor(Inst->getParent());
+ if (succ != nullptr && succ != cur)
+ return false;
+ }
+
+ // Finally, check that all the uses of the instruction are actually
+ // dominated by the candidate
+ return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified
+/// instruction out of its current block into a successor.
+static bool SinkInstruction(Instruction *Inst,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
+
+ // Don't sink static alloca instructions. CodeGen assumes allocas outside the
+ // entry block are dynamically sized stack objects.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (AI->isStaticAlloca())
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!isSafeToMove(Inst, AA, Stores))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ BasicBlock *SuccToSinkTo = nullptr;
+
+ // Instructions can only be sunk if all their uses are in blocks
+ // dominated by one of the successors.
+ // Look at all the dominated blocks and see if we can sink it in one.
+ DomTreeNode *DTN = DT.getNode(Inst->getParent());
+ for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
+ I != E && SuccToSinkTo == nullptr; ++I) {
+ BasicBlock *Candidate = (*I)->getBlock();
+    // A node always immediately dominates its children in the dominator
+    // tree.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
+ SuccToSinkTo = Candidate;
+ }
+
+ // If no suitable postdominator was found, look at all the successors and
+ // decide which one we should sink to, if any.
+ for (succ_iterator I = succ_begin(Inst->getParent()),
+ E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
+ if (IsAcceptableTarget(Inst, *I, DT, LI))
+ SuccToSinkTo = *I;
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return false;
+
+ DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false);
+ dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false);
+ dbgs() << ")\n");
+
+ // Move the instruction.
+ Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+ return true;
+}
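+
+// For illustration (hypothetical IR), given
+//
+//   entry:
+//     %v = load i32, i32* %p
+//     br i1 %c, label %a, label %b
+//   a:
+//     ret i32 %v
+//   b:
+//     ret i32 0
+//
+// the load's only use is in %a and %a's sole predecessor is entry, so the load
+// is moved to the start of %a and the path through %b never executes it.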
+
+static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
+ AAResults &AA) {
+  // Can't sink anything out of a block that has fewer than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT.isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
+ LoopInfo &LI, AAResults &AA) {
+ bool MadeChange, EverMadeChange = false;
+
+ do {
+ MadeChange = false;
+ DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ // Process all basic blocks.
+ for (BasicBlock &I : F)
+ MadeChange |= ProcessBlock(I, DT, LI, AA);
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
+ return EverMadeChange;
+}
+
+PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ if (!iterativelySinkInstructions(F, DT, LI, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+ class SinkingLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification
+ SinkingLegacyPass() : FunctionPass(ID) {
+ initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ return iterativelySinkInstructions(F, DT, LI, AA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ };
+} // end anonymous namespace
+
+char SinkingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
new file mode 100644
index 000000000000..a7c308b59877
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -0,0 +1,319 @@
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, whereas this pass is intended to do more
+// aggressive speculation, counting on later passes to either capitalize on
+// that or clean it up.
+//
+// If the pass was created by calling
+// createSpeculativeExecutionIfHasBranchDivergencePass or the
+// -spec-exec-only-if-divergent-target option is present, this pass only has an
+// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
+// on other targets, it is a nop.
+//
+// This lets you include this pass unconditionally in the IR pass pipeline, but
+// only enable it for relevant targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+ "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where the "
+ "number of instructions that would not be speculatively executed "
+ "exceeds this limit."));
+
+static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
+ "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
+ cl::desc("Speculative execution is applied only to targets with divergent "
+ "branches, even if the pass was configured to apply only to all "
+ "targets."));
+
+namespace {
+
+class SpeculativeExecutionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget),
+ Impl(OnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
+
+private:
+ // Variable preserved purely for correct name printing.
+ const bool OnlyIfDivergentTarget;
+
+ SpeculativeExecutionPass Impl;
+};
+} // namespace
+
+char SpeculativeExecutionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+
+void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return Impl.runImpl(F, TTI);
+}
+
+namespace llvm {
+
+bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
+ if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+ DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
+ return false;
+ }
+
+ this->TTI = TTI;
+ bool Changed = false;
+ for (auto& B : F) {
+ Changed |= runOnBasicBlock(B);
+ }
+ return Changed;
+}
+
+bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
+ BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
+ if (BI == nullptr)
+ return false;
+
+ if (BI->getNumSuccessors() != 2)
+ return false;
+ BasicBlock &Succ0 = *BI->getSuccessor(0);
+ BasicBlock &Succ1 = *BI->getSuccessor(1);
+
+ if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
+ return false;
+ }
+
+ // Hoist from if-then (triangle).
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ0.getSingleSuccessor() == &Succ1) {
+ return considerHoistingFromTo(Succ0, B);
+ }
+
+ // Hoist from if-else (triangle).
+ if (Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() == &Succ0) {
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ // Hoist from if-then-else (diamond), but only if it is equivalent to
+ // an if-else or if-then due to one of the branches doing nothing.
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() != nullptr &&
+ Succ1.getSingleSuccessor() != &B &&
+ Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
+    // If a block has only one instruction, that instruction is its terminator,
+    // so the block does nothing. This does happen.
+ if (Succ1.size() == 1) // equivalent to if-then
+ return considerHoistingFromTo(Succ0, B);
+ if (Succ0.size() == 1) // equivalent to if-else
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ return false;
+}
+
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
+
+bool SpeculativeExecutionPass::considerHoistingFromTo(
+ BasicBlock &FromBlock, BasicBlock &ToBlock) {
+ SmallSet<const Instruction *, 8> NotHoisted;
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
+ for (Value* V : U->operand_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (NotHoisted.count(I) > 0)
+ return false;
+ }
+ }
+ return true;
+ };
+
+ unsigned TotalSpeculationCost = 0;
+ for (auto& I : FromBlock) {
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+ if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+ AllPrecedingUsesFromBlockHoisted(&I)) {
+ TotalSpeculationCost += Cost;
+ if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+ return false; // too much to hoist
+ } else {
+ NotHoisted.insert(&I);
+ if (NotHoisted.size() > SpecExecMaxNotHoisted)
+ return false; // too much left behind
+ }
+ }
+
+ if (TotalSpeculationCost == 0)
+ return false; // nothing to hoist
+
+ for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+ // We have to increment I before moving Current as moving Current
+ // changes the list that I is iterating through.
+ auto Current = I;
+ ++I;
+ if (!NotHoisted.count(&*Current)) {
+ Current->moveBefore(ToBlock.getTerminator());
+ }
+ }
+ return true;
+}
+
+FunctionPass *createSpeculativeExecutionPass() {
+ return new SpeculativeExecutionLegacyPass();
+}
+
+FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+ return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
+}
+
+SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
+ : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
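+
+// Usage sketch (illustration only; assumes an existing Module M and the legacy
+// pass manager headers). The divergent-target variant can be scheduled
+// unconditionally because runImpl() above turns it into a no-op when
+// TTI->hasBranchDivergence() is false:
+//
+//   llvm::legacy::PassManager PM;
+//   PM.add(llvm::createSpeculativeExecutionIfHasBranchDivergencePass());
+//   PM.run(M);
+//
+// The hidden cl::opts above (-spec-exec-max-speculation-cost,
+// -spec-exec-max-not-hoisted, -spec-exec-only-if-divergent-target) tune the
+// same limits from the command line, e.g. when running under opt.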
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 000000000000..8b8d6590aa6a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,701 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR to consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+static const unsigned UnknownAddressSpace = ~0u;
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments.
+ struct Candidate {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate()
+ : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+ Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+ Basis(nullptr) {}
+ Kind CandidateKind;
+ const SCEV *Base;
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index;
+ Value *Stride;
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins;
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis;
+ };
+
+ static char ID;
+
+ StraightLineStrengthReduce()
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+ initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+ // Returns whether the candidate can be folded into an addressing mode.
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+  // Splits LHS into Base + Index and, if that succeeds, calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+ // A helper function that factors ArrayIdx to a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ std::list<Candidate> Candidates;
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // after all rewriting finishes.
+ std::vector<Instruction *> UnlinkedInstructions;
+};
+} // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+ return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // They must have the same type too. Basis.Base == C.Base doesn't
+ // guarantee their types are the same (PR23975).
+ Basis.Ins->getType() == C.Ins->getType() &&
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base, stride, and candidate kind.
+ Basis.Base == C.Base && Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
+ SmallVector<const Value*, 4> Indices;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ Indices.push_back(*I);
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+ TargetTransformInfo *TTI) {
+ // Index->getSExtValue() may crash if Index is wider than 64-bit.
+ return Index->getBitWidth() <= 64 &&
+ TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+ Index->getSExtValue(), UnknownAddressSpace);
+}
+
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+ TargetTransformInfo *TTI,
+ const DataLayout *DL) {
+ if (C.CandidateKind == Candidate::Add)
+ return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+ if (C.CandidateKind == Candidate::GEP)
+ return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
+ return false;
+}
+
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
+ //
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+    // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list so that it can be the basis of other candidates.
+ Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ allocateCandidatesAndFindBasisForAdd(I);
+ break;
+ case Instruction::Mul:
+ allocateCandidatesAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Instruction *I) {
+ // Try matching B + i * S.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I an add?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
+ if (LHS != RHS)
+ allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *S = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + Idx * S
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
+ APInt One(Idx->getBitWidth(), 1);
+ Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else {
+ // At least, I = LHS + 1 * RHS
+ ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
+ I);
+ }
+}
+
+// Returns true if A matches B + C where C is constant.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+}
+
+// Returns true if A matches B | C where C is constant.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *B = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (matchesAdd(LHS, B, Idx)) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
+ // If LHS is in the form of "Base | Index" and Base and Index have no common
+ // bits set, then
+ // Base | Index = Base + Index
+ // and I is thus in the form of "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+ I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Instruction *I) {
+ // Try matching (B + i) * S.
+ // TODO: we could extend SLSR to float and vector types.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I a mul?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
+ if (LHS != RHS) {
+ // Symmetrically, try to split RHS to Base + Index.
+ allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) * ElementSize
+ // = B + (sext(Idx) * sext(S)) * ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *nsw 1.
+ allocateCandidatesAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. this would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+ allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+ // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+ APInt One(RHS->getBitWidth(), 1);
+ ConstantInt *PowerOf2 =
+ ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+ allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ GetElementPtrInst *GEP) {
+ // TODO: handle vector GEPs
+ if (GEP->getType()->isVectorTy())
+ return;
+
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ IndexExprs.push_back(SE->getSCEV(*I));
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isStruct())
+ continue;
+
+ const SCEV *OrigIndexExpr = IndexExprs[I - 1];
+ IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
+
+ // The base of this candidate is GEP's base plus the offsets of all
+ // indices except this current one.
+ const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
+ Value *ArrayIdx = GEP->getOperand(I);
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (ArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if ArrayIdx is wider than the pointer size, because
+ // ArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+ // When ArrayIdx is the sext of a value, we try to factor that value as
+ // well. Handling this case is important because array indices are
+ // typically sign-extended to the pointer size.
+ Value *TruncatedArrayIdx = nullptr;
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
+ TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
+ // because TruncatedArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+
+ IndexExprs[I - 1] = OrigIndexExpr;
+ }
+}
+
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.isAllOnesValue())
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
+ }
+ if ((-IndexOffset).isPowerOf2()) {
+    // If (i - i') is a power of 2, Bump = -(sext/trunc(S) << log(i - i')).
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+ }
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
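+
+// Worked example (hypothetical candidates): with Basis X = B + 2 * S and
+// C = B + 6 * S, IndexOffset is 4, a power of two, so the bump is emitted as
+// "shl S, 2" and rewriteCandidateWithBasis later produces "add X, (shl S, 2)"
+// instead of recomputing B + 6 * S from scratch.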
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+
+ IRBuilder<> Builder(C.Ins);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul:
+ // C = Basis + Bump
+ if (BinaryOperator::isNeg(Bump)) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced =
+ Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+ // usually unsound, e.g.,
+ //
+ // X = (-2 +nsw 1) *nsw INT_MAX
+ // Y = (-2 +nsw 3) *nsw INT_MAX
+ // =>
+ // Y = X + 2 * INT_MAX
+ //
+      // Neither + nor * in the resultant expression is nsw.
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ break;
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ if (InBounds)
+ Reduced =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+ else
+ Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ if (InBounds)
+ Reduced = Builder.CreateInBoundsGEP(nullptr, Basis.Ins, Bump);
+ else
+ Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump);
+ }
+ }
+ break;
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  // Traverse the dominator tree in depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (const auto Node : depth_first(DT))
+ for (auto &I : *(Node->getBlock()))
+ allocateCandidatesAndFindBasis(&I);
+
+  // Rewrite candidates in reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+  // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) {
+ for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+ Value *Op = UnlinkedInst->getOperand(I);
+ UnlinkedInst->setOperand(I, nullptr);
+ RecursivelyDeleteTriviallyDeadInstructions(Op);
+ }
+ UnlinkedInst->deleteValue();
+ }
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
+ return Ret;
+}
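+
+// End-to-end illustration (hypothetical source): after unrolling a loop over
+// a[j * s] by a factor of three, the straight-line body contains
+//
+//   p1 = &a[1 * s];
+//   p2 = &a[2 * s];
+//   p3 = &a[3 * s];
+//
+// p2's GEP gets p1's as its immediate basis and p3's gets p2's, so they are
+// rewritten as p2 = &p1[s] and p3 = &p2[s]: the multiplications by s vanish
+// and each element after the first costs only one bump.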
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
new file mode 100644
index 000000000000..0cccb415efdb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -0,0 +1,937 @@
+//===-- StructurizeCFG.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
+
+typedef SmallVector<RegionNode*, 8> RNVector;
+typedef SmallVector<BasicBlock*, 8> BBVector;
+typedef SmallVector<BranchInst*, 8> BranchVector;
+typedef SmallVector<BBValuePair, 2> BBValueVector;
+
+typedef SmallPtrSet<BasicBlock *, 8> BBSet;
+
+typedef MapVector<PHINode *, BBValueVector> PhiMap;
+typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
+
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
+typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
+
+// The name for newly created blocks.
+static const char *const FlowBlockName = "Flow";
+
+/// Finds the nearest common dominator of a set of BasicBlocks.
+///
+/// For every BB you add to the set, you can specify whether we "remember" the
+/// block. When you get the common dominator, you can also ask whether it's one
+/// of the blocks we remembered.
+class NearestCommonDominator {
+ DominatorTree *DT;
+ BasicBlock *Result = nullptr;
+ bool ResultIsRemembered = false;
+
+  /// Add BB to the set and update the nearest common dominator found so far.
+ void addBlock(BasicBlock *BB, bool Remember) {
+ if (!Result) {
+ Result = BB;
+ ResultIsRemembered = Remember;
+ return;
+ }
+
+ BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
+ if (NewResult != Result)
+ ResultIsRemembered = false;
+ if (NewResult == BB)
+ ResultIsRemembered |= Remember;
+ Result = NewResult;
+ }
+
+public:
+ explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
+
+ void addBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ false);
+ }
+
+ void addAndRememberBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ true);
+ }
+
+ /// Get the nearest common dominator of all the BBs added via addBlock() and
+ /// addAndRememberBlock().
+ BasicBlock *result() { return Result; }
+
+  /// Is the BB returned by result() one of the blocks we added to the set
+ /// with addAndRememberBlock()?
+ bool resultIsRememberedBlock() { return ResultIsRemembered; }
+};
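+
+// A minimal usage sketch of NearestCommonDominator (illustrative only, not
+// code that runs anywhere; it mirrors how insertConditions() below drives the
+// helper, with PredicateBlocks standing in for the real predicate map):
+//
+//   NearestCommonDominator Dominator(DT);
+//   Dominator.addBlock(Parent);                 // block holding the branch
+//   for (BasicBlock *PredBB : PredicateBlocks)  // hypothetical container
+//     Dominator.addAndRememberBlock(PredBB);
+//   if (!Dominator.resultIsRememberedBlock())
+//     PhiInserter.AddAvailableValue(Dominator.result(), Default);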
+
+/// @brief Transforms the control flow graph on a single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// || Where:
+/// | | 1 = "If" block, calculates the condition
+/// 4 | 2 = "Then" subregion, runs if the condition is true
+/// | /   3 = "Flow" block, a newly inserted flow block that rejoins the flow
+/// |/ 4 = "Else" optional subregion, runs if the condition is false
+/// 5 5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incoming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// In addition, even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^ Where:
+/// | / 1 = "Entry" block
+/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3 3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consists of a network of PHI nodes where the true incoming values express
+/// breaks and the false values express continue states.
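+///
+/// As a rough illustration (the block and value names below are invented and
+/// are not produced verbatim by this pass), the predicate network for a single
+/// loop exit might look like:
+///
+/// \verbatim
+/// Flow:
+///   %break = phi i1 [ true, %breaking.block ], [ false, %continuing.block ]
+///   br i1 %break, label %after.loop, label %loop.entry  ; false = back edge
+/// \endverbatim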
+class StructurizeCFG : public RegionPass {
+ bool SkipUniformRegions;
+
+ Type *Boolean;
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+
+ Function *Func;
+ Region *ParentRegion;
+
+ DominatorTree *DT;
+ LoopInfo *LI;
+
+ SmallVector<RegionNode *, 8> Order;
+ BBSet Visited;
+
+ BBPhiMap DeletedPhis;
+ BB2BBVecMap AddedPhis;
+
+ PredMap Predicates;
+ BranchVector Conditions;
+
+ BB2BBMap Loops;
+ PredMap LoopPreds;
+ BranchVector LoopConds;
+
+ RegionNode *PrevNode;
+
+ void orderNodes();
+
+ void analyzeLoops(RegionNode *N);
+
+ Value *invert(Value *Condition);
+
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+ void gatherPredicates(RegionNode *N);
+
+ void collectInfos();
+
+ void insertConditions(bool Loops);
+
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void setPhiValues();
+
+ void killTerminator(BasicBlock *BB);
+
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator);
+
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+ BasicBlock *needPrefix(bool NeedEmpty);
+
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+ void setPrevNode(BasicBlock *BB);
+
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+ bool isPredictableTrue(RegionNode *Node);
+
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void createFlow();
+
+ void rebuildSSA();
+
+public:
+ static char ID;
+
+ explicit StructurizeCFG(bool SkipUniformRegions = false)
+ : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+ initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Region *R, RGPassManager &RGM) override;
+
+ bool runOnRegion(Region *R, RGPassManager &RGM) override;
+
+ StringRef getPassName() const override { return "Structurize control flow"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (SkipUniformRegions)
+ AU.addRequired<DivergenceAnalysis>();
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ RegionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char StructurizeCFG::ID = 0;
+
+INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+ false, false)
+
+/// \brief Initialize the types and constants used in the pass
+bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+ LLVMContext &Context = R->getEntry()->getContext();
+
+ Boolean = Type::getInt1Ty(Context);
+ BoolTrue = ConstantInt::getTrue(Context);
+ BoolFalse = ConstantInt::getFalse(Context);
+ BoolUndef = UndefValue::get(Boolean);
+
+ return false;
+}
+
+/// \brief Build up the general order of nodes
+void StructurizeCFG::orderNodes() {
+ ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+ SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
+
+  // A reverse post-order traversal of the region gives us an ordering close
+ // to what we want. The only problem with it is that sometimes backedges
+ // for outer loops will be visited before backedges for inner loops.
+ for (RegionNode *RN : RPOT) {
+ BasicBlock *BB = RN->getEntry();
+ Loop *Loop = LI->getLoopFor(BB);
+ ++LoopBlocks[Loop];
+ }
+
+ unsigned CurrentLoopDepth = 0;
+ Loop *CurrentLoop = nullptr;
+ for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+ BasicBlock *BB = (*I)->getEntry();
+ unsigned LoopDepth = LI->getLoopDepth(BB);
+
+ if (is_contained(Order, *I))
+ continue;
+
+ if (LoopDepth < CurrentLoopDepth) {
+ // Make sure we have visited all blocks in this loop before moving back to
+ // the outer loop.
+
+ auto LoopI = I;
+ while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
+ LoopI++;
+ BasicBlock *LoopBB = (*LoopI)->getEntry();
+ if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ --BlockCount;
+ Order.push_back(*LoopI);
+ }
+ }
+ }
+
+ CurrentLoop = LI->getLoopFor(BB);
+ if (CurrentLoop)
+ LoopBlocks[CurrentLoop]--;
+
+ CurrentLoopDepth = LoopDepth;
+ Order.push_back(*I);
+ }
+
+ // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal,
+  // rather than reworking the whole pass to operate on the list in order,
+  // we simply reverse the list and continue to operate on it in reverse.
+ std::reverse(Order.begin(), Order.end());
+}
+
+/// \brief Determine the end of the loops
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+ if (N->isSubRegion()) {
+ // Test for exit as back edge
+ BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+ if (Visited.count(Exit))
+ Loops[Exit] = N->getEntry();
+
+ } else {
+ // Test for successors as back edge
+ BasicBlock *BB = N->getNodeAs<BasicBlock>();
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
+ }
+}
+
+/// \brief Invert the given condition
+Value *StructurizeCFG::invert(Value *Condition) {
+ // First: Check if it's a constant
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
+
+ // Second: If the condition is already inverted, return the original value
+ if (match(Condition, m_Not(m_Value(Condition))))
+ return Condition;
+
+ if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+ // Third: Check all the users for an invert
+ BasicBlock *Parent = Inst->getParent();
+ for (User *U : Condition->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+ return I;
+
+ // Last option: Create a new instruction
+ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+ }
+
+ if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+ BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+ return BinaryOperator::CreateNot(Condition,
+ Arg->getName() + ".inv",
+ EntryBlock.getTerminator());
+ }
+
+ llvm_unreachable("Unhandled condition to invert");
+}
+
+/// \brief Build the condition for one edge
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
+ Value *Cond = Invert ? BoolFalse : BoolTrue;
+ if (Term->isConditional()) {
+ Cond = Term->getCondition();
+
+ if (Idx != (unsigned)Invert)
+ Cond = invert(Cond);
+ }
+ return Cond;
+}
+
+/// \brief Analyze the predecessors of each block and build up predicates
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+ RegionInfo *RI = ParentRegion->getRegionInfo();
+ BasicBlock *BB = N->getEntry();
+ BBPredicates &Pred = Predicates[BB];
+ BBPredicates &LPred = LoopPreds[BB];
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // Ignore it if it's a branch from outside into our region entry
+ if (!ParentRegion->contains(P))
+ continue;
+
+ Region *R = RI->getRegionFor(P);
+ if (R == ParentRegion) {
+ // It's a top level block in our region
+ BranchInst *Term = cast<BranchInst>(P->getTerminator());
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (Succ != BB)
+ continue;
+
+ if (Visited.count(P)) {
+ // Normal forward edge
+ if (Term->isConditional()) {
+ // Try to treat it like an ELSE block
+ BasicBlock *Other = Term->getSuccessor(!i);
+ if (Visited.count(Other) && !Loops.count(Other) &&
+ !Pred.count(Other) && !Pred.count(P)) {
+
+ Pred[Other] = BoolFalse;
+ Pred[P] = BoolTrue;
+ continue;
+ }
+ }
+ Pred[P] = buildCondition(Term, i, false);
+ } else {
+ // Back edge
+ LPred[P] = buildCondition(Term, i, true);
+ }
+ }
+ } else {
+ // It's an exit from a sub region
+ while (R->getParent() != ParentRegion)
+ R = R->getParent();
+
+ // Edge from inside a subregion to its entry, ignore it
+ if (*R == *N)
+ continue;
+
+ BasicBlock *Entry = R->getEntry();
+ if (Visited.count(Entry))
+ Pred[Entry] = BoolTrue;
+ else
+ LPred[Entry] = BoolFalse;
+ }
+ }
+}
+
+/// \brief Collect various loop and predicate infos
+void StructurizeCFG::collectInfos() {
+ // Reset predicate
+ Predicates.clear();
+
+ // and loop infos
+ Loops.clear();
+ LoopPreds.clear();
+
+ // Reset the visited nodes
+ Visited.clear();
+
+ for (RegionNode *RN : reverse(Order)) {
+ DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << " Loop Depth: "
+ << LI->getLoopDepth(RN->getEntry()) << "\n");
+
+ // Analyze all the conditions leading to a node
+ gatherPredicates(RN);
+
+ // Remember that we've seen this node
+ Visited.insert(RN->getEntry());
+
+ // Find the last back edges
+ analyzeLoops(RN);
+ }
+}
+
+/// \brief Insert the missing branch conditions
+void StructurizeCFG::insertConditions(bool Loops) {
+ BranchVector &Conds = Loops ? LoopConds : Conditions;
+ Value *Default = Loops ? BoolTrue : BoolFalse;
+ SSAUpdater PhiInserter;
+
+ for (BranchInst *Term : Conds) {
+ assert(Term->isConditional());
+
+ BasicBlock *Parent = Term->getParent();
+ BasicBlock *SuccTrue = Term->getSuccessor(0);
+ BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+ PhiInserter.Initialize(Boolean, "");
+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(Parent);
+
+ Value *ParentValue = nullptr;
+ for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ BasicBlock *BB = BBAndPred.first;
+ Value *Pred = BBAndPred.second;
+
+ if (BB == Parent) {
+ ParentValue = Pred;
+ break;
+ }
+ PhiInserter.AddAvailableValue(BB, Pred);
+ Dominator.addAndRememberBlock(BB);
+ }
+
+ if (ParentValue) {
+ Term->setCondition(ParentValue);
+ } else {
+ if (!Dominator.resultIsRememberedBlock())
+ PhiInserter.AddAvailableValue(Dominator.result(), Default);
+
+ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+ }
+ }
+}
+
+/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+ PhiMap &Map = DeletedPhis[To];
+ for (Instruction &I : *To) {
+ if (!isa<PHINode>(I))
+ break;
+ PHINode &Phi = cast<PHINode>(I);
+ while (Phi.getBasicBlockIndex(From) != -1) {
+ Value *Deleted = Phi.removeIncomingValue(From, false);
+ Map[&Phi].push_back(std::make_pair(From, Deleted));
+ }
+ }
+}
+
+/// \brief Add a dummy PHI value as soon as we know the new predecessor
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+ for (Instruction &I : *To) {
+ if (!isa<PHINode>(I))
+ break;
+ PHINode &Phi = cast<PHINode>(I);
+ Value *Undef = UndefValue::get(Phi.getType());
+ Phi.addIncoming(Undef, From);
+ }
+ AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
+void StructurizeCFG::setPhiValues() {
+ SSAUpdater Updater;
+ for (const auto &AddedPhi : AddedPhis) {
+ BasicBlock *To = AddedPhi.first;
+ const BBVector &From = AddedPhi.second;
+
+ if (!DeletedPhis.count(To))
+ continue;
+
+ PhiMap &Map = DeletedPhis[To];
+ for (const auto &PI : Map) {
+ PHINode *Phi = PI.first;
+ Value *Undef = UndefValue::get(Phi->getType());
+ Updater.Initialize(Phi->getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(To, Undef);
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(To);
+ for (const auto &VI : PI.second) {
+ Updater.AddAvailableValue(VI.first, VI.second);
+ Dominator.addAndRememberBlock(VI.first);
+ }
+
+ if (!Dominator.resultIsRememberedBlock())
+ Updater.AddAvailableValue(Dominator.result(), Undef);
+
+ for (BasicBlock *FI : From) {
+ int Idx = Phi->getBasicBlockIndex(FI);
+ assert(Idx != -1);
+ Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
+ }
+ }
+
+ DeletedPhis.erase(To);
+ }
+ assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+ TerminatorInst *Term = BB->getTerminator();
+ if (!Term)
+ return;
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI)
+ delPhiValues(BB, *SI);
+
+ Term->eraseFromParent();
+}
+
+/// \brief Let node exit(s) point to NewExit
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator) {
+ if (Node->isSubRegion()) {
+ Region *SubRegion = Node->getNodeAs<Region>();
+ BasicBlock *OldExit = SubRegion->getExit();
+ BasicBlock *Dominator = nullptr;
+
+ // Find all the edges from the sub region to the exit
+ for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
+      // Increment BBI before mucking with BB's terminator.
+ BasicBlock *BB = *BBI++;
+
+ if (!SubRegion->contains(BB))
+ continue;
+
+ // Modify the edges to point to the new exit
+ delPhiValues(BB, OldExit);
+ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+ addPhiValues(BB, NewExit);
+
+ // Find the new dominator (if requested)
+ if (IncludeDominator) {
+ if (!Dominator)
+ Dominator = BB;
+ else
+ Dominator = DT->findNearestCommonDominator(Dominator, BB);
+ }
+ }
+
+ // Change the dominator (if requested)
+ if (Dominator)
+ DT->changeImmediateDominator(NewExit, Dominator);
+
+ // Update the region info
+ SubRegion->replaceExit(NewExit);
+ } else {
+ BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+ killTerminator(BB);
+ BranchInst::Create(NewExit, BB);
+ addPhiValues(BB, NewExit);
+ if (IncludeDominator)
+ DT->changeImmediateDominator(NewExit, BB);
+ }
+}
+
+/// \brief Create a new flow node and update dominator tree and region info
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+ LLVMContext &Context = Func->getContext();
+ BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+ Order.back()->getEntry();
+ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+ Func, Insert);
+ DT->addNewBlock(Flow, Dominator);
+ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+ return Flow;
+}
+
+/// \brief Create a new flow node or reuse the previous node as the flow node
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+ BasicBlock *Entry = PrevNode->getEntry();
+
+ if (!PrevNode->isSubRegion()) {
+ killTerminator(Entry);
+ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+ return Entry;
+ }
+
+ // create a new flow node
+ BasicBlock *Flow = getNextFlow(Entry);
+
+ // and wire it up
+ changeExit(PrevNode, Flow, true);
+ PrevNode = ParentRegion->getBBNode(Flow);
+ return Flow;
+}
+
+/// \brief Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+ bool ExitUseAllowed) {
+ if (!Order.empty() || !ExitUseAllowed)
+ return getNextFlow(Flow);
+
+ BasicBlock *Exit = ParentRegion->getExit();
+ DT->changeImmediateDominator(Exit, Flow);
+ addPhiValues(Flow, Exit);
+ return Exit;
+}
+
+/// \brief Set the previous node
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+ : nullptr;
+}
+
+/// \brief Does BB dominate all the predicates of Node?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
+}
+
+/// \brief Can we predict that this node will always be called?
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ bool Dominated = false;
+
+  // Region entry is always true
+ if (!PrevNode)
+ return true;
+
+ for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ BasicBlock *BB = Pred.first;
+ Value *V = Pred.second;
+
+ if (V != BoolTrue)
+ return false;
+
+ if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
+ Dominated = true;
+ }
+
+ // TODO: The dominator check is too strict
+ return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.pop_back_val();
+ Visited.insert(Node->getEntry());
+
+ if (isPredictableTrue(Node)) {
+ // Just a linear flow
+ if (PrevNode) {
+ changeExit(PrevNode, Node->getEntry(), true);
+ }
+ PrevNode = Node;
+
+ } else {
+ // Insert extra prefix node (or reuse last one)
+ BasicBlock *Flow = needPrefix(false);
+
+ // Insert extra postfix node (or use exit instead)
+ BasicBlock *Entry = Node->getEntry();
+ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+ // let it point to entry and next block
+ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+ addPhiValues(Flow, Entry);
+ DT->changeImmediateDominator(Entry, Flow);
+
+ PrevNode = Node;
+ while (!Order.empty() && !Visited.count(LoopEnd) &&
+ dominatesPredicates(Entry, Order.back())) {
+ handleLoops(false, LoopEnd);
+ }
+
+ changeExit(PrevNode, Next, false);
+ setPrevNode(Next);
+ }
+}
+
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.back();
+ BasicBlock *LoopStart = Node->getEntry();
+
+ if (!Loops.count(LoopStart)) {
+ wireFlow(ExitUseAllowed, LoopEnd);
+ return;
+ }
+
+ if (!isPredictableTrue(Node))
+ LoopStart = needPrefix(true);
+
+ LoopEnd = Loops[Node->getEntry()];
+ wireFlow(false, LoopEnd);
+ while (!Visited.count(LoopEnd)) {
+ handleLoops(false, LoopEnd);
+ }
+
+  // If the start of the loop is the entry block, we can't branch to it, so
+ // insert a new dummy entry block.
+ Function *LoopFunc = LoopStart->getParent();
+ if (LoopStart == &LoopFunc->getEntryBlock()) {
+ LoopStart->setName("entry.orig");
+
+ BasicBlock *NewEntry =
+ BasicBlock::Create(LoopStart->getContext(),
+ "entry",
+ LoopFunc,
+ LoopStart);
+ BranchInst::Create(LoopStart, NewEntry);
+ DT->setNewRoot(NewEntry);
+ }
+
+ // Create an extra loop end node
+ LoopEnd = needPrefix(false);
+ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+ LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+ BoolUndef, LoopEnd));
+ addPhiValues(LoopEnd, LoopStart);
+ setPrevNode(Next);
+}
+
+/// After this function runs, the control flow looks the way it should, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+ BasicBlock *Exit = ParentRegion->getExit();
+ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Conditions.clear();
+ LoopConds.clear();
+
+ PrevNode = nullptr;
+ Visited.clear();
+
+ while (!Order.empty()) {
+ handleLoops(EntryDominatesExit, nullptr);
+ }
+
+ if (PrevNode)
+ changeExit(PrevNode, Exit, EntryDominatesExit);
+ else
+ assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes' instructions
+/// no longer dominate all their uses. It is not clear whether this is really
+/// necessary.
+void StructurizeCFG::rebuildSSA() {
+ SSAUpdater Updater;
+ for (BasicBlock *BB : ParentRegion->blocks())
+ for (Instruction &I : *BB) {
+ bool Initialized = false;
+ // We may modify the use list as we iterate over it, so be careful to
+ // compute the next element in the use list at the top of the loop.
+ for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI++;
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (User->getParent() == BB) {
+ continue;
+ } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ }
+
+ if (DT->dominates(&I, User))
+ continue;
+
+ if (!Initialized) {
+ Value *Undef = UndefValue::get(I.getType());
+ Updater.Initialize(I.getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(BB, &I);
+ Initialized = true;
+ }
+ Updater.RewriteUseAfterInsertions(U);
+ }
+ }
+}
+
+static bool hasOnlyUniformBranches(const Region *R,
+ const DivergenceAnalysis &DA) {
+ for (const BasicBlock *BB : R->blocks()) {
+ const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!DA.isUniform(Br->getCondition()))
+ return false;
+ DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+ }
+ return true;
+}
+
+/// \brief Run the transformation for each region found
+bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+ if (R->isTopLevelRegion())
+ return false;
+
+ if (SkipUniformRegions) {
+ // TODO: We could probably be smarter here with how we handle sub-regions.
+ auto &DA = getAnalysis<DivergenceAnalysis>();
+ if (hasOnlyUniformBranches(R, DA)) {
+ DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+
+ // Mark all direct child block terminators as having been treated as
+ // uniform. To account for a possible future in which non-uniform
+ // sub-regions are treated more cleverly, indirect children are not
+ // marked as uniform.
+ MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {});
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
+ continue;
+
+ if (Instruction *Term = E->getEntry()->getTerminator())
+ Term->setMetadata("structurizecfg.uniform", MD);
+ }
+
+ return false;
+ }
+ }
+
+ Func = R->getEntry()->getParent();
+ ParentRegion = R;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ orderNodes();
+ collectInfos();
+ createFlow();
+ insertConditions(false);
+ insertConditions(true);
+ setPhiValues();
+ rebuildSSA();
+
+ // Cleanup
+ Order.clear();
+ Visited.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Predicates.clear();
+ Conditions.clear();
+ Loops.clear();
+ LoopPreds.clear();
+ LoopConds.clear();
+
+ return true;
+}
+
+Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
+ return new StructurizeCFG(SkipUniformRegions);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 000000000000..9397b87cdf56
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,849 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative and commutative expression to use an
+// accumulator variable, thus compiling the typical naive factorial or
+// 'fib' implementation into efficient code.
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
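+// As a rough illustration of extension 2 (a sketch only; 'fac' is an invented
+// example and this pass works on IR, not C++ source), a naive factorial like
+//
+//   int fac(int n) { return n <= 1 ? 1 : n * fac(n - 1); }
+//
+// is kept from being tail recursive by the multiply after the call; the
+// accumulator transformation makes it behave like
+//
+//   int fac(int n) {
+//     int acc = 1;                 // seeded from the base-case return value
+//     for (; n > 1; --n) acc *= n; // the associative/commutative operation
+//     return acc;
+//   }
+//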
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+//     in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+// prevent the TRE from occurring. For example, there could be GEP's and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped, "Number of returns duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+/// \brief Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool canTRE(Function &F) {
+ // Because of PR962, we don't TRE dynamic allocas.
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (!AI->isStaticAlloca())
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+namespace {
+struct AllocaDerivedValueTracker {
+  // Start at a root value and walk its use-def chain, recording in
+  // AllocaUsers the calls that use the value or a derived value, and in
+  // EscapePoints the places where it may escape.
+ void walk(Value *Root) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ auto AddUsesToWorklist = [&](Value *V) {
+ for (auto &U : V->uses()) {
+ if (!Visited.insert(&U).second)
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+
+ AddUsesToWorklist(Root);
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(I);
+ bool IsNocapture =
+ CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
+ callUsesLocalStack(CS, IsNocapture);
+ if (IsNocapture) {
+ // If the alloca-derived argument is passed in as nocapture, then it
+ // can't propagate to the call's return. That would be capturing.
+ continue;
+ }
+ break;
+ }
+ case Instruction::Load: {
+ // The result of a load is not alloca-derived (unless an alloca has
+ // otherwise escaped, but this is a local analysis).
+ continue;
+ }
+ case Instruction::Store: {
+ if (U->getOperandNo() == 0)
+ EscapePoints.insert(I);
+ continue; // Stores have no users to analyze.
+ }
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ break;
+ default:
+ EscapePoints.insert(I);
+ break;
+ }
+
+ AddUsesToWorklist(I);
+ }
+ }
+
+ void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+ // Add it to the list of alloca users.
+ AllocaUsers.insert(CS.getInstruction());
+
+ // If it's nocapture then it can't capture this alloca.
+ if (IsNocapture)
+ return;
+
+ // If it can write to memory, it can leak the alloca value.
+ if (!CS.onlyReadsMemory())
+ EscapePoints.insert(CS.getInstruction());
+ }
+
+ SmallPtrSet<Instruction *, 32> AllocaUsers;
+ SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
+
+static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
+ if (F.callsFunctionThatReturnsTwice())
+ return false;
+ AllCallsAreTailCalls = true;
+
+ // The local stack holds all alloca instructions and all byval arguments.
+ AllocaDerivedValueTracker Tracker;
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr())
+ Tracker.walk(&Arg);
+ }
+ for (auto &BB : F) {
+ for (auto &I : BB)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Tracker.walk(AI);
+ }
+
+ bool Modified = false;
+
+ // Track whether a block is reachable after an alloca has escaped. Blocks that
+ // contain the escaping instruction will be marked as being visited without an
+ // escaped alloca, since that is how the block began.
+ enum VisitType {
+ UNVISITED,
+ UNESCAPED,
+ ESCAPED
+ };
+ DenseMap<BasicBlock *, VisitType> Visited;
+
+ // We propagate the fact that an alloca has escaped from block to successor.
+ // Visit the blocks that are propagating the escapedness first. To do this, we
+ // maintain two worklists.
+ SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+ // We may enter a block and visit it thinking that no alloca has escaped yet,
+ // then see an escape point and go back around a loop edge and come back to
+ // the same block twice. Because of this, we defer setting tail on calls when
+  // we first encounter them in a block. No entry in this list statically uses
+  // an alloca via use-def chain analysis, but one may still reach an alloca
+ // through other means if the block turns out to be reachable after an escape
+ // point.
+ SmallVector<CallInst *, 32> DeferredTails;
+
+ BasicBlock *BB = &F.getEntryBlock();
+ VisitType Escaped = UNESCAPED;
+ do {
+ for (auto &I : *BB) {
+ if (Tracker.EscapePoints.count(&I))
+ Escaped = ESCAPED;
+
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI || CI->isTailCall())
+ continue;
+
+ bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
+
+ if (!IsNoTail && CI->doesNotAccessMemory()) {
+ // A call to a readnone function whose arguments are all things computed
+ // outside this function can be marked tail. Even if you stored the
+ // alloca address into a global, a readnone function can't load the
+ // global anyhow.
+ //
+ // Note that this runs whether we know an alloca has escaped or not. If
+ // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+ bool SafeToTail = true;
+ for (auto &Arg : CI->arg_operands()) {
+ if (isa<Constant>(Arg.getUser()))
+ continue;
+ if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+ if (!A->hasByValAttr())
+ continue;
+ SafeToTail = false;
+ break;
+ }
+ if (SafeToTail) {
+ emitOptimizationRemark(
+ F.getContext(), "tailcallelim", F, CI->getDebugLoc(),
+ "marked this readnone call a tail call candidate");
+ CI->setTailCall();
+ Modified = true;
+ continue;
+ }
+ }
+
+ if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+ DeferredTails.push_back(CI);
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+ auto &State = Visited[SuccBB];
+ if (State < Escaped) {
+ State = Escaped;
+ if (State == ESCAPED)
+ WorklistEscaped.push_back(SuccBB);
+ else
+ WorklistUnescaped.push_back(SuccBB);
+ }
+ }
+
+ if (!WorklistEscaped.empty()) {
+ BB = WorklistEscaped.pop_back_val();
+ Escaped = ESCAPED;
+ } else {
+ BB = nullptr;
+ while (!WorklistUnescaped.empty()) {
+ auto *NextBB = WorklistUnescaped.pop_back_val();
+ if (Visited[NextBB] == UNESCAPED) {
+ BB = NextBB;
+ Escaped = UNESCAPED;
+ break;
+ }
+ }
+ }
+ } while (BB);
+
+ for (CallInst *CI : DeferredTails) {
+ if (Visited[CI->getParent()] != ESCAPED) {
+ // If the escape point was part way through the block, calls after the
+ // escape point wouldn't have been put into DeferredTails.
+ emitOptimizationRemark(F.getContext(), "tailcallelim", F,
+ CI->getDebugLoc(),
+ "marked this call a tail call candidate");
+ CI->setTailCall();
+ Modified = true;
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ return Modified;
+}
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+ // FIXME: We can move load/store/call/free instructions above the call if the
+ // call does not mod/ref the memory location being processed.
+ if (I->mayHaveSideEffects()) // This also handles volatile loads.
+ return false;
+
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+    // Loads may always be moved above calls that have no side effects.
+ if (CI->mayHaveSideEffects()) {
+ // Non-volatile loads may be moved above a call with side effects if it
+ // does not write to memory and the load provably won't trap.
+ // Writes to memory only matter if they may alias the pointer
+ // being loaded from.
+ const DataLayout &DL = L->getModule()->getDataLayout();
+ if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) ||
+ !isSafeToLoadUnconditionally(L->getPointerOperand(),
+ L->getAlignment(), DL, L))
+ return false;
+ }
+ }
+
+ // Otherwise, if this is a side-effect free instruction, check to make sure
+ // that it does not use the return value of the call. If it doesn't use the
+ // return value of the call, it must only use things that are defined before
+ // the call, or movable instructions between the call and the instruction
+ // itself.
+ return !is_contained(I->operands(), CI);
+}
+
+/// Return true if the specified value is the same when the return would exit
+/// as it was when the initial iteration of the recursive function was executed.
+///
+/// We currently handle static constants and arguments that are not modified as
+/// part of the recursion.
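+///
+/// For example (hypothetical, written as C++ for brevity): in
+///   int f(int x, int n) { return n == 0 ? x : f(x, n - 1); }
+/// the 'x' returned by the base case is a dynamic constant, because every
+/// recursive call passes 'x' through unchanged in the same argument position.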
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
+ if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+  // Check to see if this is an immutable argument; if so, the value
+ // will be available to initialize the accumulator.
+ if (Argument *Arg = dyn_cast<Argument>(V)) {
+ // Figure out which argument number this is...
+ unsigned ArgNo = 0;
+ Function *F = CI->getParent()->getParent();
+ for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+ ++ArgNo;
+
+    // If we are passing this argument into the call as the corresponding
+ // argument operand, then the argument is dynamically constant.
+ // Otherwise, we cannot transform this function safely.
+ if (CI->getArgOperand(ArgNo) == Arg)
+ return true;
+ }
+
+ // Switch cases are always constant integers. If the value is being switched
+ // on and the return is only reachable from one of its cases, it's
+ // effectively constant.
+ if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
+ if (SI->getCondition() == V)
+ return SI->getDefaultDest() != RI->getParent();
+
+ // Not a constant or immutable argument, we can't safely transform.
+ return false;
+}
+
+/// Check to see if the function containing the specified tail call consistently
+/// returns the same runtime-constant value at all exit points except for
+/// IgnoreRI. If so, return the returned value.
+static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
+ Function *F = CI->getParent()->getParent();
+ Value *ReturnedValue = nullptr;
+
+ for (BasicBlock &BBI : *F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator());
+ if (RI == nullptr || RI == IgnoreRI) continue;
+
+ // We can only perform this transformation if the value returned is
+ // evaluatable at the start of the initial invocation of the function,
+ // instead of at the end of the evaluation.
+ //
+ Value *RetOp = RI->getOperand(0);
+ if (!isDynamicConstant(RetOp, CI, RI))
+ return nullptr;
+
+ if (ReturnedValue && RetOp != ReturnedValue)
+ return nullptr; // Cannot transform if differing values are returned.
+ ReturnedValue = RetOp;
+ }
+ return ReturnedValue;
+}
+
+/// If the specified instruction can be transformed using accumulator recursion
+/// elimination, return the constant which is the start of the accumulator
+/// value. Otherwise return null.
+static Value *canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+ if (!I->isAssociative() || !I->isCommutative()) return nullptr;
+ assert(I->getNumOperands() == 2 &&
+ "Associative/commutative operations should have 2 args!");
+
+ // Exactly one operand should be the result of the call instruction.
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+ (I->getOperand(0) != CI && I->getOperand(1) != CI))
+ return nullptr;
+
+ // The only user of this instruction we allow is a single return instruction.
+ if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+ return nullptr;
+
+ // Ok, now we have to check all of the other return instructions in this
+ // function. If they return non-constants or differing values, then we cannot
+ // transform the function safely.
+ return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
+}
+
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ return &*I;
+}
+
+static CallInst *findTRECandidate(Instruction *TI,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI) {
+ BasicBlock *BB = TI->getParent();
+ Function *F = BB->getParent();
+
+ if (&BB->front() == TI) // Make sure there is something before the terminator.
+ return nullptr;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI = nullptr;
+ BasicBlock::iterator BBI(TI);
+ while (true) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == F)
+ break;
+
+ if (BBI == BB->begin())
+ return nullptr; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return nullptr;
+
+ // As a special case, detect code like this:
+ // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+ // and disable this xform in this case, because the code generator will
+ // lower the call to fabs into inline code.
+ if (BB == &F->getEntryBlock() &&
+ firstNonDbg(BB->front().getIterator()) == CI &&
+ firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
+ // A single-block function with just a call and a return. Check that
+ // the arguments match.
+ CallSite::arg_iterator I = CallSite(CI).arg_begin(),
+ E = CallSite(CI).arg_end();
+ Function::arg_iterator FI = F->arg_begin(),
+ FE = F->arg_end();
+ for (; I != E && FI != FE; ++I, ++FI)
+ if (*I != &*FI) break;
+ if (I == E && FI == FE)
+ return nullptr;
+ }
+
+ return CI;
+}
+
+static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ AliasAnalysis *AA) {
+ // If we are introducing accumulator recursion to eliminate operations after
+ // the call instruction that are both associative and commutative, the initial
+ // value for the accumulator is placed in this variable. If this value is set
+ // then we actually perform accumulator recursion elimination instead of
+ // simple tail recursion elimination. If the operation is an LLVM instruction
+ // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then
+ // we are handling the case when the return instruction returns a constant C
+  // which is different from the constant returned by other return instructions
+ // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a
+ // special case of accumulator recursion, the operation being "return C".
+ Value *AccumulatorRecursionEliminationInitVal = nullptr;
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ BasicBlock::iterator BBI(CI);
+ for (++BBI; &*BBI != Ret; ++BBI) {
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
+
+ // If we can't move the instruction above the call, it might be because it
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember the initial accumulator value for later.
+ if ((AccumulatorRecursionEliminationInitVal =
+ canTransformAccumulatorRecursion(&*BBI, CI))) {
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccumulatorRecursionInstr = &*BBI;
+ } else {
+ return false; // Otherwise, we cannot eliminate the tail recursion!
+ }
+ }
+
+ // We can only transform call/return pairs that either ignore the return value
+ // of the call and return void, ignore the value of the call and return a
+  // constant, return the value returned by the tail call, or that are handled
+  // by accumulator recursion variable elimination.
+ if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+ !isa<UndefValue>(Ret->getReturnValue()) &&
+ AccumulatorRecursionEliminationInitVal == nullptr &&
+ !getCommonReturnValue(nullptr, CI)) {
+ // One case remains that we are able to handle: the current return
+ // instruction returns a constant, and all other return instructions
+ // return a different constant.
+ if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+ return false; // Current return instruction does not return a constant.
+ // Check that all other return instructions return a common constant. If
+ // so, record it in AccumulatorRecursionEliminationInitVal.
+ AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+ if (!AccumulatorRecursionEliminationInitVal)
+ return false;
+ }
+
+ BasicBlock *BB = Ret->getParent();
+ Function *F = BB->getParent();
+
+ emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+ "transforming tail recursion to loop");
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (!OldEntry) {
+ OldEntry = &F->getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
+ NewEntry->takeName(OldEntry);
+ OldEntry->setName("tailrecurse");
+ BranchInst::Create(OldEntry, NewEntry);
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ TailCallsAreMarkedTail = CI->isTailCall();
+ if (TailCallsAreMarkedTail)
+ // Move all fixed sized allocas from OldEntry to NewEntry.
+ for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+ NEBI = NewEntry->begin(); OEBI != E; )
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &OldEntry->front();
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ PHINode *PN = PHINode::Create(I->getType(), 2,
+ I->getName() + ".tr", InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone uses the PHI node now!
+ PN->addIncoming(&*I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+ }
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another. We
+ // have to choose whether we move allocas in the entry block to the new entry
+ // block or not, so we can't make a good choice for both. NOTE: We could do
+ // slightly better here in the case that the function has no entry block
+ // allocas.
+ if (TailCallsAreMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+ // If we are introducing an accumulator variable to eliminate the recursion,
+ // do so now. Note that we _know_ that no subsequent tail recursion
+ // eliminations will happen on this function because of the way the
+ // accumulator recursion predicate is set up.
+ //
+ if (AccumulatorRecursionEliminationInitVal) {
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
+ PHINode *AccPN = PHINode::Create(
+ AccumulatorRecursionEliminationInitVal->getType(),
+ std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the initial value,
+ // computed earlier. For any other existing branches to this block (due to
+ // other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to OldEntry yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F->getEntryBlock())
+ AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
+ else
+ AccPN->addIncoming(AccPN, P);
+ }
+
+ if (AccRecInstr) {
+ // Add an incoming argument for the current block, which is computed by
+ // our associative and commutative accumulator instruction.
+ AccPN->addIncoming(AccRecInstr, BB);
+
+      // Next, rewrite the accumulator recursion instruction so that it no
+      // longer uses the result of the call; instead, it uses the PHI node we
+      // just inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ } else {
+ // Add an incoming argument for the current block, which is just the
+ // constant returned by the current return instruction.
+ AccPN->addIncoming(Ret->getReturnValue(), BB);
+ }
+
+    // Finally, rewrite any return instructions in the program to return the
+    // PHI node instead of the "initval" they currently return. This loop will
+ // actually rewrite the return value we are destroying, but that's ok.
+ for (BasicBlock &BBI : *F)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator()))
+ RI->setOperand(0, AccPN);
+ ++NumAccumAdded;
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+ NewBI->setDebugLoc(CI->getDebugLoc());
+
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ ++NumEliminated;
+ return true;
+}
+
+static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
+ bool Change = false;
+
+ // Make sure this block is a trivial return block.
+ assert(BB->getFirstNonPHIOrDbg() == Ret &&
+ "Trying to fold non-trivial return block");
+
+ // If the return block contains nothing but the return and PHI's,
+ // there might be an opportunity to duplicate the return in its
+ // predecessors and perform TRE there. Look for predecessors that end
+  // in an unconditional branch and recursive call(s).
+ SmallVector<BranchInst*, 8> UncondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ TerminatorInst *PTI = Pred->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(BI);
+ }
+
+ while (!UncondBranchPreds.empty()) {
+ BranchInst *BI = UncondBranchPreds.pop_back_val();
+ BasicBlock *Pred = BI->getParent();
+    if (CallInst *CI =
+            findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)) {
+ DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
+ ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+ // Cleanup: if all predecessors of BB have been eliminated by
+ // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+ // because the ret instruction in there is still using a value which
+ // eliminateRecursiveTailCall will attempt to remove.
+ if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+ BB->eraseFromParent();
+
+ eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, AA);
+ ++NumRetDuped;
+ Change = true;
+ }
+ }
+
+ return Change;
+}
+
+static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
+ CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
+ if (!CI)
+ return false;
+
+ return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, AA);
+}
+
+static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
+ if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ bool MadeChange = false;
+ bool AllCallsAreTailCalls = false;
+ MadeChange |= markTails(F, AllCallsAreTailCalls);
+ if (!AllCallsAreTailCalls)
+ return MadeChange;
+
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg())
+ return false;
+
+ BasicBlock *OldEntry = nullptr;
+ bool TailCallsAreMarkedTail = false;
+ SmallVector<PHINode*, 8> ArgumentPHIs;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+  // attribute, because doing so would cause the stack size to increase (a
+  // real tail call would deallocate variable-sized allocas, but this
+  // transformation does not).
+ bool CanTRETailMarkedCall = canTRE(F);
+
+ // Change any tail recursive calls to loops.
+ //
+ // FIXME: The code generator produces really bad code when an 'escaping
+ // alloca' is changed from being a static alloca to being a dynamic alloca.
+ // Until this is resolved, disable this transformation if that would ever
+ // happen. This bug is PR962.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+ BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ bool Change =
+ processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+ Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
+ TailCallsAreMarkedTail, ArgumentPHIs,
+ !CanTRETailMarkedCall, TTI, AA);
+ MadeChange |= Change;
+ }
+ }
+
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+    // If the PHI node is a dynamic constant, replace it with the value it
+    // represents.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ return MadeChange;
+}
+
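+// Illustrative sketch (not part of the pass; names are made up): given a
+// tail-recursive accumulator such as
+//
+//   static int sumTo(int N, int Acc) {
+//     if (N == 0)
+//       return Acc;
+//     return sumTo(N - 1, Acc + N); // tail-recursive call
+//   }
+//
+// eliminateTailRecursion rewrites the recursive call into a branch back to
+// the (split) entry block, with one PHI node per argument (here N and Acc)
+// merging the initial arguments with the values from the former call site,
+// so the function runs in constant stack space.
+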
+namespace {
+struct TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ return eliminateTailRecursion(
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+
+ bool Changed = eliminateTailRecursion(F, &TTI, &AA);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
new file mode 100644
index 000000000000..df9d5da9e26e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -0,0 +1,150 @@
+//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+namespace llvm {
+
+// We sort the stack variables by alignment (largest first) to minimize
+// unnecessary large gaps due to alignment.
+// It is tempting to also sort variables by size so that larger variables
+// have larger redzones at both ends. But reordering will make report analysis
+// harder, especially when temporary unnamed variables are present.
+// So, until we can provide more information (type, line number, etc)
+// for the stack variables we avoid reordering them too much.
+static inline bool CompareVars(const ASanStackVariableDescription &a,
+ const ASanStackVariableDescription &b) {
+ return a.Alignment > b.Alignment;
+}
+
+// We also force minimal alignment for all vars to kMinAlignment so that vars
+// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
+static const size_t kMinAlignment = 16;
+
+// The larger the variable Size, the larger the redzone.
+// The resulting frame size is a multiple of Alignment.
+static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
+ size_t Res = 0;
+ if (Size <= 4) Res = 16;
+ else if (Size <= 16) Res = 32;
+ else if (Size <= 128) Res = Size + 32;
+ else if (Size <= 512) Res = Size + 64;
+ else if (Size <= 4096) Res = Size + 128;
+ else Res = Size + 256;
+ return alignTo(Res, Alignment);
+}
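+
+// Worked examples (illustrative only; the values follow directly from the
+// table above):
+//   VarAndRedzoneSize(4, 16)   == 16   (Size <= 4  -> 16)
+//   VarAndRedzoneSize(20, 16)  == 64   (20 + 32 = 52, rounded up to 64)
+//   VarAndRedzoneSize(100, 32) == 160  (100 + 32 = 132, rounded up to 160)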
+
+ASanStackFrameLayout
+ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ size_t Granularity, size_t MinHeaderSize) {
+ assert(Granularity >= 8 && Granularity <= 64 &&
+ (Granularity & (Granularity - 1)) == 0);
+ assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
+ MinHeaderSize >= Granularity);
+ const size_t NumVars = Vars.size();
+ assert(NumVars > 0);
+ for (size_t i = 0; i < NumVars; i++)
+ Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
+
+ std::stable_sort(Vars.begin(), Vars.end(), CompareVars);
+
+ ASanStackFrameLayout Layout;
+ Layout.Granularity = Granularity;
+ Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
+ size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
+ Vars[0].Alignment);
+ assert((Offset % Granularity) == 0);
+ for (size_t i = 0; i < NumVars; i++) {
+ bool IsLast = i == NumVars - 1;
+ size_t Alignment = std::max(Granularity, Vars[i].Alignment);
+ (void)Alignment; // Used only in asserts.
+ size_t Size = Vars[i].Size;
+ assert((Alignment & (Alignment - 1)) == 0);
+ assert(Layout.FrameAlignment >= Alignment);
+ assert((Offset % Alignment) == 0);
+ assert(Size > 0);
+ size_t NextAlignment = IsLast ? Granularity
+ : std::max(Granularity, Vars[i + 1].Alignment);
+ size_t SizeWithRedzone = VarAndRedzoneSize(Size, NextAlignment);
+ Vars[i].Offset = Offset;
+ Offset += SizeWithRedzone;
+ }
+ if (Offset % MinHeaderSize) {
+ Offset += MinHeaderSize - (Offset % MinHeaderSize);
+ }
+ Layout.FrameSize = Offset;
+ assert((Layout.FrameSize % MinHeaderSize) == 0);
+ return Layout;
+}
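+
+// Illustrative example (hypothetical parameters): with Granularity = 8,
+// MinHeaderSize = 16 and a single variable {Size = 8, Alignment = 4}, the
+// alignment is first raised to kMinAlignment (16), the variable is placed at
+// Offset 16 (the first 16 bytes form the left redzone), its size with redzone
+// is VarAndRedzoneSize(8, 8) == 32, and the resulting layout has
+// FrameSize == 48 and FrameAlignment == 16.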
+
+SmallString<64> ComputeASanStackFrameDescription(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ StackDescription << Vars.size();
+
+ for (const auto &Var : Vars) {
+ std::string Name = Var.Name;
+ if (Var.Line) {
+ Name += ":";
+ Name += to_string(Var.Line);
+ }
+ StackDescription << " " << Var.Offset << " " << Var.Size << " "
+ << Name.size() << " " << Name;
+ }
+ return StackDescription.str();
+}
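+
+// For the single-variable example above (a variable named "x", no line info,
+// Offset 16, Size 8) the resulting description string would be "1 16 8 1 x";
+// with a line number of, say, 7 the name becomes "x:7" and the string
+// "1 16 8 3 x:7". (Purely illustrative; the names are made up.)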
+
+SmallVector<uint8_t, 64>
+GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ assert(Vars.size() > 0);
+ SmallVector<uint8_t, 64> SB;
+ SB.clear();
+ const size_t Granularity = Layout.Granularity;
+ SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
+ for (const auto &Var : Vars) {
+ SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
+
+ SB.resize(SB.size() + Var.Size / Granularity, 0);
+ if (Var.Size % Granularity)
+ SB.push_back(Var.Size % Granularity);
+ }
+ SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
+ return SB;
+}
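+
+// Continuing the illustrative single-variable example (Granularity 8, the
+// variable at Offset 16 with Size 8, FrameSize 48), the shadow bytes are
+// { L, L, 0, R, R, R }, where L and R stand for kAsanStackLeftRedzoneMagic
+// and kAsanStackRightRedzoneMagic respectively.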
+
+SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
+ const size_t Granularity = Layout.Granularity;
+
+ for (const auto &Var : Vars) {
+ assert(Var.LifetimeSize <= Var.Size);
+ const size_t LifetimeShadowSize =
+ (Var.LifetimeSize + Granularity - 1) / Granularity;
+ const size_t Offset = Var.Offset / Granularity;
+ std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
+ kAsanStackUseAfterScopeMagic);
+ }
+
+ return SB;
+}
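+
+// For the same illustrative variable with LifetimeSize == Size == 8, the
+// addressable 0 byte above is overwritten with kAsanStackUseAfterScopeMagic,
+// giving { L, L, S, R, R, R }.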
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
new file mode 100644
index 000000000000..4c9746b8c691
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -0,0 +1,252 @@
+//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file adds DWARF discriminators to the IR. Path discriminators are
+// used to decide what CFG path was taken inside sub-graphs whose instructions
+// share the same line and column number information.
+//
+// The main user of this is the sample profiler. Instruction samples are
+// mapped to line number information. Since a single line may be spread
+// out over several basic blocks, discriminators add more precise location
+// for the samples.
+//
+// For example,
+//
+// 1 #define ASSERT(P)
+// 2 if (!(P))
+// 3 abort()
+// ...
+// 100 while (true) {
+// 101 ASSERT (sum < 0);
+// 102 ...
+// 130 }
+//
+// when converted to IR, this snippet looks something like:
+//
+// while.body: ; preds = %entry, %if.end
+// %0 = load i32* %sum, align 4, !dbg !15
+// %cmp = icmp slt i32 %0, 0, !dbg !15
+// br i1 %cmp, label %if.end, label %if.then, !dbg !15
+//
+// if.then: ; preds = %while.body
+// call void @abort(), !dbg !15
+// br label %if.end, !dbg !15
+//
+// Notice that all the instructions in blocks 'while.body' and 'if.then'
+// have exactly the same debug information. When this program is sampled
+// at runtime, the profiler will assume that all these instructions are
+// equally frequent. This, in turn, makes the profiler consider the edge
+// while.body->if.then to be frequently taken (which is incorrect).
+//
+// By adding a discriminator value to the instructions in block 'if.then',
+// we can distinguish instructions at line 101 with discriminator 0 from
+// the instructions at line 101 with discriminator 1.
+//
+// For more details about DWARF discriminators, please visit
+// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "add-discriminators"
+
+namespace {
+// The legacy pass of AddDiscriminators.
+struct AddDiscriminatorsLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
+ initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char AddDiscriminatorsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+
+// Command line option to disable discriminator generation even in the
+// presence of debug information. This is only needed when debugging
+// debug info generation issues.
+static cl::opt<bool> NoDiscriminators(
+ "no-discriminators", cl::init(false),
+ cl::desc("Disable generation of discriminator information."));
+
+// Create the legacy AddDiscriminatorsPass.
+FunctionPass *llvm::createAddDiscriminatorsPass() {
+ return new AddDiscriminatorsLegacyPass();
+}
+
+static bool shouldHaveDiscriminator(const Instruction *I) {
+ return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
+/// \brief Assign DWARF discriminators.
+///
+/// To assign discriminators, we examine the boundaries of every
+/// basic block and its successors. Suppose there is a basic block B1
+/// with successor B2. The last instruction I1 in B1 and the first
+/// instruction I2 in B2 are located at the same file and line number.
+/// This situation is illustrated in the following code snippet:
+///
+/// if (i < 10) x = i;
+///
+/// entry:
+/// br i1 %cmp, label %if.then, label %if.end, !dbg !10
+/// if.then:
+/// %1 = load i32* %i.addr, align 4, !dbg !10
+/// store i32 %1, i32* %x, align 4, !dbg !10
+/// br label %if.end, !dbg !10
+/// if.end:
+/// ret void, !dbg !12
+///
+/// Notice how the branch instruction in block 'entry' and all the
+/// instructions in block 'if.then' have the exact same debug location
+/// information (!dbg !10).
+///
+/// To distinguish instructions in block 'entry' from instructions in
+/// block 'if.then', we generate a new lexical block for all the
+/// instructions in block 'if.then' that share the same file and line
+/// location as the last instruction of block 'entry'.
+///
+/// This new lexical block will have the same location information as
+/// the previous one, but with a new DWARF discriminator value.
+///
+/// One of the main uses of this discriminator value is in runtime
+/// sample profilers. It allows the profiler to distinguish instructions
+/// at location !dbg !10 that execute on different basic blocks. This is
+/// important because while the predicate 'if (x < 10)' may have been
+/// executed millions of times, the assignment 'x = i' may have only
+/// executed a handful of times (meaning that the entry->if.then edge is
+/// seldom taken).
+///
+/// If we did not have discriminator information, the profiler would
+/// assign the same weight to both blocks 'entry' and 'if.then', which
+/// in turn will make it conclude that the entry->if.then edge is very
+/// hot.
+///
+/// To decide where to create new discriminator values, this function
+/// traverses the CFG and examines instructions at basic block boundaries.
+/// If the last instruction I1 of a block B1 is at the same file and line
+/// location as instruction I2 of successor B2, then it creates a new
+/// lexical block for I2 and all the instructions in B2 that share the same
+/// file and line location as I2. This new lexical block will have a
+/// different discriminator number than I1.
+static bool addDiscriminators(Function &F) {
+ // If the function has debug information, but the user has disabled
+ // discriminators, do nothing.
+ // Similarly, if the function has no debug info, do nothing.
+ if (NoDiscriminators || !F.getSubprogram())
+ return false;
+
+ bool Changed = false;
+
+ typedef std::pair<StringRef, unsigned> Location;
+ typedef DenseSet<const BasicBlock *> BBSet;
+ typedef DenseMap<Location, BBSet> LocationBBMap;
+ typedef DenseMap<Location, unsigned> LocationDiscriminatorMap;
+ typedef DenseSet<Location> LocationSet;
+
+ LocationBBMap LBM;
+ LocationDiscriminatorMap LDM;
+
+ // Traverse all instructions in the function. If the source line location
+ // of the instruction appears in another basic block, assign a new
+ // discriminator for this instruction.
+ for (BasicBlock &B : F) {
+ for (auto &I : B.getInstList()) {
+ // Not all intrinsic calls should have a discriminator.
+ // We want to avoid a non-deterministic assignment of discriminators at
+ // different debug levels. We still allow discriminators on memory
+ // intrinsic calls because those can be early expanded by SROA into
+ // pairs of loads and stores, and the expanded load/store instructions
+ // should have a valid discriminator.
+ if (!shouldHaveDiscriminator(&I))
+ continue;
+ const DILocation *DIL = I.getDebugLoc();
+ if (!DIL)
+ continue;
+ Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
+ auto &BBMap = LBM[L];
+ auto R = BBMap.insert(&B);
+ if (BBMap.size() == 1)
+ continue;
+ // If more than one block was inserted with the same line+file, a
+ // discriminator is needed to distinguish the instructions.
+ // Only the lowest 7 bits are used to represent a discriminator to fit
+ // it in 1 byte ULEB128 representation.
+ unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+ I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
+ DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
+ Changed = true;
+ }
+ }
+
+ // Traverse all instructions and assign new discriminators to call
+ // instructions with the same lineno that are in the same basic block.
+ // Sample-based profiling needs to distinguish different function calls
+ // within the same source line for correct profile annotation.
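+ // For example (illustrative), in "int r = f(a) + f(b);" both calls map to
+ // the same file:line; the second call seen on that line gets a fresh base
+ // discriminator, so its samples can be attributed to the correct call site.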
+ for (BasicBlock &B : F) {
+ LocationSet CallLocations;
+ for (auto &I : B.getInstList()) {
+ CallInst *Current = dyn_cast<CallInst>(&I);
+ // We bypass intrinsic calls for the following two reasons:
+ // 1) We want to avoid a non-deterministic assignment of
+ // discriminators.
+ // 2) We want to minimize the number of base discriminators used.
+ if (!Current || isa<IntrinsicInst>(&I))
+ continue;
+
+ DILocation *CurrentDIL = Current->getDebugLoc();
+ if (!CurrentDIL)
+ continue;
+ Location L =
+ std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
+ if (!CallLocations.insert(L).second) {
+ unsigned Discriminator = ++LDM[L];
+ Current->setDebugLoc(CurrentDIL->setBaseDiscriminator(Discriminator));
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
+ return addDiscriminators(F);
+}
+PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!addDiscriminators(F))
+ return PreservedAnalyses::all();
+
+ // FIXME: should be all()
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
new file mode 100644
index 000000000000..3d5cbfc93f2e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -0,0 +1,769 @@
+//===-- BasicBlockUtils.cpp - BasicBlock Utilities ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on basic blocks and on the
+// instructions contained within them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+void llvm::DeleteDeadBlock(BasicBlock *BB) {
+ assert((pred_begin(BB) == pred_end(BB) ||
+ // Can delete self loop.
+ BB->getSinglePredecessor() == BB) && "Block is not dead!");
+ TerminatorInst *BBTerm = BB->getTerminator();
+
+ // Loop through all of our successors and make sure they know that one
+ // of their predecessors is going away.
+ for (BasicBlock *Succ : BBTerm->successors())
+ Succ->removePredecessor(BB);
+
+ // Zap all the instructions in the block.
+ while (!BB->empty()) {
+ Instruction &I = BB->back();
+ // If this instruction is used, replace uses with an arbitrary value.
+ // Because control flow can't get here, we don't care what we replace the
+ // value with. Note that since this block is unreachable, and all values
+ // contained within it must dominate their uses, all uses will
+ // eventually be removed (they are themselves dead).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->getInstList().pop_back();
+ }
+
+ // Zap the block!
+ BB->eraseFromParent();
+}
+
+void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
+ MemoryDependenceResults *MemDep) {
+ if (!isa<PHINode>(BB->begin())) return;
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+
+ if (MemDep)
+ MemDep->removeInstruction(PN); // Memdep updates AA itself.
+
+ PN->eraseFromParent();
+ }
+}
+
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
+ // Recursively deleting a PHI may cause multiple PHIs to be deleted
+ // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (BasicBlock::iterator I = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ bool Changed = false;
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
+
+ return Changed;
+}
+
+bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
+ LoopInfo *LI,
+ MemoryDependenceResults *MemDep) {
+ // Don't merge away blocks whose address is taken.
+ if (BB->hasAddressTaken()) return false;
+
+ // Can't merge if there are multiple predecessors, or no predecessors.
+ BasicBlock *PredBB = BB->getUniquePredecessor();
+ if (!PredBB) return false;
+
+ // Don't break self-loops.
+ if (PredBB == BB) return false;
+ // Don't break unwinding instructions.
+ if (PredBB->getTerminator()->isExceptional())
+ return false;
+
+ succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
+ BasicBlock *OnlySucc = BB;
+ for (; SI != SE; ++SI)
+ if (*SI != OnlySucc) {
+ OnlySucc = nullptr; // There are multiple distinct successors!
+ break;
+ }
+
+ // Can't merge if there are multiple successors.
+ if (!OnlySucc) return false;
+
+ // Can't merge if there is PHI loop.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+ for (Value *IncValue : PN->incoming_values())
+ if (IncValue == PN)
+ return false;
+ } else
+ break;
+ }
+
+ // Begin by getting rid of unneeded PHIs.
+ if (isa<PHINode>(BB->front()))
+ FoldSingleEntryPHINodes(BB, MemDep);
+
+ // Delete the unconditional branch from the predecessor...
+ PredBB->getInstList().pop_back();
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(PredBB);
+
+ // Move all definitions in the successor to the predecessor...
+ PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+
+ // If the predecessor block has no name, inherit the merged block's name.
+ if (!PredBB->hasName())
+ PredBB->takeName(BB);
+
+ // Finally, erase the old block and update dominator info.
+ if (DT)
+ if (DomTreeNode *DTN = DT->getNode(BB)) {
+ DomTreeNode *PredDTN = DT->getNode(PredBB);
+ SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
+ for (DomTreeNode *DI : Children)
+ DT->changeImmediateDominator(DI, PredDTN);
+
+ DT->eraseNode(BB);
+ }
+
+ if (LI)
+ LI->removeBlock(BB);
+
+ if (MemDep)
+ MemDep->invalidateCachedPredecessors();
+
+ BB->eraseFromParent();
+ return true;
+}
+
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Value *V) {
+ Instruction &I = *BI;
+ // Replaces all of the uses of the instruction with uses of the value
+ I.replaceAllUsesWith(V);
+
+ // Make sure to propagate a name if there is one already.
+ if (I.hasName() && !V->hasName())
+ V->takeName(&I);
+
+ // Delete the unnecessary instruction now...
+ BI = BIL.erase(BI);
+}
+
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Instruction *I) {
+ assert(I->getParent() == nullptr &&
+ "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+ // Copy debug location to newly added instruction, if it wasn't already set
+ // by the caller.
+ if (!I->getDebugLoc())
+ I->setDebugLoc(BI->getDebugLoc());
+
+ // Insert the new instruction into the basic block...
+ BasicBlock::iterator New = BIL.insert(BI, I);
+
+ // Replace all uses of the old instruction, and delete it.
+ ReplaceInstWithValue(BIL, BI, I);
+
+ // Move BI back to point to the newly inserted instruction
+ BI = New;
+}
+
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+ BasicBlock::iterator BI(From);
+ ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
+
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
+ LoopInfo *LI) {
+ unsigned SuccNum = GetSuccessorNumber(BB, Succ);
+
+ // If this is a critical edge, let SplitCriticalEdge do it.
+ TerminatorInst *LatchTerm = BB->getTerminator();
+ if (SplitCriticalEdge(LatchTerm, SuccNum, CriticalEdgeSplittingOptions(DT, LI)
+ .setPreserveLCSSA()))
+ return LatchTerm->getSuccessor(SuccNum);
+
+ // If the edge isn't critical, then BB has a single successor or Succ has a
+ // single pred. Split the block.
+ if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+ // If the successor only has a single pred, split the top of the successor
+ // block.
+ assert(SP == BB && "CFG broken");
+ SP = nullptr;
+ return SplitBlock(Succ, &Succ->front(), DT, LI);
+ }
+
+ // Otherwise, if BB has a single successor, split it at the bottom of the
+ // block.
+ assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+ "Should have a single succ!");
+ return SplitBlock(BB, BB->getTerminator(), DT, LI);
+}
+
+unsigned
+llvm::SplitAllCriticalEdges(Function &F,
+ const CriticalEdgeSplittingOptions &Options) {
+ unsigned NumBroken = 0;
+ for (BasicBlock &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (SplitCriticalEdge(TI, i, Options))
+ ++NumBroken;
+ }
+ return NumBroken;
+}
+
+BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
+ DominatorTree *DT, LoopInfo *LI) {
+ BasicBlock::iterator SplitIt = SplitPt->getIterator();
+ while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())
+ ++SplitIt;
+ BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split");
+
+ // The new block lives in whichever loop the old one did. This preserves
+ // LCSSA as well, because we force the split point to be after any PHI nodes.
+ if (LI)
+ if (Loop *L = LI->getLoopFor(Old))
+ L->addBasicBlockToLoop(New, *LI);
+
+ if (DT)
+ // Old dominates New. New node dominates all other nodes dominated by Old.
+ if (DomTreeNode *OldNode = DT->getNode(Old)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Old);
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+
+ return New;
+}
+
+/// Update DominatorTree, LoopInfo, and LCSSA analysis information.
+static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds,
+ DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA, bool &HasLoopExit) {
+ // Update dominator tree if available.
+ if (DT)
+ DT->splitBlock(NewBB);
+
+ // The rest of the logic is only relevant for updating the loop structures.
+ if (!LI)
+ return;
+
+ Loop *L = LI->getLoopFor(OldBB);
+
+ // If we need to preserve loop analyses, collect some information about how
+ // this split will affect loops.
+ bool IsLoopEntry = !!L;
+ bool SplitMakesNewLoopHeader = false;
+ for (BasicBlock *Pred : Preds) {
+ // If we need to preserve LCSSA, determine if any of the preds is a loop
+ // exit.
+ if (PreserveLCSSA)
+ if (Loop *PL = LI->getLoopFor(Pred))
+ if (!PL->contains(OldBB))
+ HasLoopExit = true;
+
+ // If we need to preserve LoopInfo, note whether any of the preds crosses
+ // an interesting loop boundary.
+ if (!L)
+ continue;
+ if (L->contains(Pred))
+ IsLoopEntry = false;
+ else
+ SplitMakesNewLoopHeader = true;
+ }
+
+ // Unless we have a loop for OldBB, nothing else to do here.
+ if (!L)
+ return;
+
+ if (IsLoopEntry) {
+ // Add the new block to the nearest enclosing loop (and not an adjacent
+ // loop). To find this, examine each of the predecessors and determine which
+ // loops enclose them, and select the most-nested loop which contains the
+ // loop containing the block being split.
+ Loop *InnermostPredLoop = nullptr;
+ for (BasicBlock *Pred : Preds) {
+ if (Loop *PredLoop = LI->getLoopFor(Pred)) {
+ // Seek a loop which actually contains the block being split (to avoid
+ // adjacent loops).
+ while (PredLoop && !PredLoop->contains(OldBB))
+ PredLoop = PredLoop->getParentLoop();
+
+ // Select the most-nested of these loops which contains the block.
+ if (PredLoop && PredLoop->contains(OldBB) &&
+ (!InnermostPredLoop ||
+ InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+ InnermostPredLoop = PredLoop;
+ }
+ }
+
+ if (InnermostPredLoop)
+ InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ L->addBasicBlockToLoop(NewBB, *LI);
+ if (SplitMakesNewLoopHeader)
+ L->moveToHeader(NewBB);
+ }
+}
+
+/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
+/// This also updates AliasAnalysis, if available.
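+/// Illustrative example (names made up): if OrigBB starts with
+/// "phi [%a, %P1], [%b, %P2], [%c, %P3]" and Preds == {P1, P2}, then when
+/// %a == %b the PHI simply becomes "phi [%a, %NewBB], [%c, %P3]"; otherwise,
+/// or when HasLoopExit requires LCSSA PHIs, a new PHI merging %a and %b is
+/// created in NewBB and used as the incoming value from NewBB.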
+static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+ bool HasLoopExit) {
+ // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
+ SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
+ for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I++);
+
+ // Check to see if all of the values coming in are the same. If so, we
+ // don't need to create a new PHI node, unless it's needed for LCSSA.
+ Value *InVal = nullptr;
+ if (!HasLoopExit) {
+ InVal = PN->getIncomingValueForBlock(Preds[0]);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (!PredSet.count(PN->getIncomingBlock(i)))
+ continue;
+ if (!InVal)
+ InVal = PN->getIncomingValue(i);
+ else if (InVal != PN->getIncomingValue(i)) {
+ InVal = nullptr;
+ break;
+ }
+ }
+ }
+
+ if (InVal) {
+ // If all incoming values for the new PHI would be the same, just don't
+ // make a new PHI. Instead, just remove the incoming values from the old
+ // PHI.
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values
+ // aren't invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
+ if (PredSet.count(PN->getIncomingBlock(i)))
+ PN->removeIncomingValue(i, false);
+
+ // Add an incoming value to the PHI node in the loop for the preheader
+ // edge.
+ PN->addIncoming(InVal, NewBB);
+ continue;
+ }
+
+ // If the values coming into the block are not the same, we need a new
+ // PHI.
+ // Create the new PHI node, insert it into NewBB at the end of the block
+ PHINode *NewPHI =
+ PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values aren't
+ // invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+ if (PredSet.count(IncomingBB)) {
+ Value *V = PN->removeIncomingValue(i, false);
+ NewPHI->addIncoming(V, IncomingBB);
+ }
+ }
+
+ PN->addIncoming(NewPHI, NewBB);
+ }
+}
+
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ // Do not attempt to split that which cannot be split.
+ if (!BB->canSplitPredecessors())
+ return nullptr;
+
+ // For the landingpads we need to act a bit differently.
+ // Delegate this work to the SplitLandingPadPredecessors.
+ if (BB->isLandingPad()) {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ std::string NewName = std::string(Suffix) + ".split-lp";
+
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs, DT,
+ LI, PreserveLCSSA);
+ return NewBBs[0];
+ }
+
+ // Create new basic block, insert right before the original block.
+ BasicBlock *NewBB = BasicBlock::Create(
+ BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI = BranchInst::Create(BB, NewBB);
+ BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB instead of BB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+ }
+
+ // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+ // node becomes an incoming value for BB's phi node. However, if the Preds
+ // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+ // account for the newly created predecessor.
+ if (Preds.size() == 0) {
+ // Insert dummy values as the incoming value.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+ cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ return NewBB;
+ }
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ bool HasLoopExit = false;
+ UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA,
+ HasLoopExit);
+
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ return NewBB;
+}
+
+void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix1, const char *Suffix2,
+ SmallVectorImpl<BasicBlock *> &NewBBs,
+ DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA) {
+ assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
+
+ // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
+ // it right before the original block.
+ BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix1,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB1);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+ BI1->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB1 instead of OrigBB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
+ }
+
+ bool HasLoopExit = false;
+ UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DT, LI, PreserveLCSSA,
+ HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB1.
+ UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ SmallVector<BasicBlock*, 8> NewBB2Preds;
+ for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
+ i != e; ) {
+ BasicBlock *Pred = *i++;
+ if (Pred == NewBB1) continue;
+ assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ NewBB2Preds.push_back(Pred);
+ e = pred_end(OrigBB);
+ }
+
+ BasicBlock *NewBB2 = nullptr;
+ if (!NewBB2Preds.empty()) {
+ // Create another basic block for the rest of OrigBB's predecessors.
+ NewBB2 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix2,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB2);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+ BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ for (BasicBlock *NewBB2Pred : NewBB2Preds)
+ NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ HasLoopExit = false;
+ UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DT, LI,
+ PreserveLCSSA, HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB2.
+ UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);
+ }
+
+ LandingPadInst *LPad = OrigBB->getLandingPadInst();
+ Instruction *Clone1 = LPad->clone();
+ Clone1->setName(Twine("lpad") + Suffix1);
+ NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
+
+ if (NewBB2) {
+ Instruction *Clone2 = LPad->clone();
+ Clone2->setName(Twine("lpad") + Suffix2);
+ NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
+
+ // Create a PHI node for the two cloned landingpad instructions only
+ // if the original landingpad instruction has some uses.
+ if (!LPad->use_empty()) {
+ assert(!LPad->getType()->isTokenTy() &&
+ "Split cannot be applied if LPad is token type. Otherwise an "
+ "invalid PHINode of token type would be created.");
+ PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+ PN->addIncoming(Clone1, NewBB1);
+ PN->addIncoming(Clone2, NewBB2);
+ LPad->replaceAllUsesWith(PN);
+ }
+ LPad->eraseFromParent();
+ } else {
+ // There is no second clone. Just replace the landing pad with the first
+ // clone.
+ LPad->replaceAllUsesWith(Clone1);
+ LPad->eraseFromParent();
+ }
+}
+
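+// Illustrative sketch (block and value names are made up): if Pred ends in
+// "br label %BB" and %BB contains only "ret i32 %r" where %r is a PHI in %BB,
+// the branch is removed and a clone of the return, using the value %r
+// receives from Pred, is placed at the end of Pred.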
+ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
+ BasicBlock *Pred) {
+ Instruction *UncondBranch = Pred->getTerminator();
+ // Clone the return and add it to the end of the predecessor.
+ Instruction *NewRet = RI->clone();
+ Pred->getInstList().push_back(NewRet);
+
+ // If the return instruction returns a value, and if the value was a
+ // PHI node in "BB", propagate the right value into the return.
+ for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+ i != e; ++i) {
+ Value *V = *i;
+ Instruction *NewBC = nullptr;
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+ // Return value might be bitcasted. Clone and insert it before the
+ // return instruction.
+ V = BCI->getOperand(0);
+ NewBC = BCI->clone();
+ Pred->getInstList().insert(NewRet->getIterator(), NewBC);
+ *i = NewBC;
+ }
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (PN->getParent() == BB) {
+ if (NewBC)
+ NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ else
+ *i = PN->getIncomingValueForBlock(Pred);
+ }
+ }
+ }
+
+ // Update any PHI nodes in the returning block to realize that we no
+ // longer branch to them.
+ BB->removePredecessor(Pred);
+ UncondBranch->eraseFromParent();
+ return cast<ReturnInst>(NewRet);
+}
+
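+// Illustrative usage (hypothetical caller code, not part of this file):
+//   TerminatorInst *Then =
+//       SplitBlockAndInsertIfThen(Cond, I, /*Unreachable=*/false);
+//   IRBuilder<> IRB(Then);
+//   IRB.CreateCall(ReportFn); // Cond, I and ReportFn are assumed to exist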
+TerminatorInst *
+llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
+ bool Unreachable, MDNode *BranchWeights,
+ DominatorTree *DT, LoopInfo *LI) {
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ TerminatorInst *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ TerminatorInst *CheckTerm;
+ if (Unreachable)
+ CheckTerm = new UnreachableInst(C, ThenBlock);
+ else
+ CheckTerm = BranchInst::Create(Tail, ThenBlock);
+ CheckTerm->setDebugLoc(SplitBefore->getDebugLoc());
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cond);
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+
+ if (DT) {
+ if (DomTreeNode *OldNode = DT->getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
+ for (DomTreeNode *Child : Children)
+ DT->changeImmediateDominator(Child, NewNode);
+
+ // Head dominates ThenBlock.
+ DT->addNewBlock(ThenBlock, Head);
+ }
+ }
+
+ if (LI) {
+ if (Loop *L = LI->getLoopFor(Head)) {
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
+ }
+
+ return CheckTerm;
+}
+
+void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
+ TerminatorInst **ThenTerm,
+ TerminatorInst **ElseTerm,
+ MDNode *BranchWeights) {
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ TerminatorInst *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ *ThenTerm = BranchInst::Create(Tail, ThenBlock);
+ (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ *ElseTerm = BranchInst::Create(Tail, ElseBlock);
+ (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond);
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+}
+
+
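+// Illustrative sketch of one shape GetIfCondition recognizes (names made up):
+// for
+//
+//   if (C) { X = A; } else { X = B; }
+//
+// the join block BB contains "phi [%A, %then], [%B, %else]" and both
+// predecessors are reached from a common block ending in
+// "br i1 %C, label %then, label %else"; the function returns %C with
+// IfTrue/IfFalse set to the corresponding predecessor blocks.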
+Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+ BasicBlock *&IfFalse) {
+ PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
+ BasicBlock *Pred1 = nullptr;
+ BasicBlock *Pred2 = nullptr;
+
+ if (SomePHI) {
+ if (SomePHI->getNumIncomingValues() != 2)
+ return nullptr;
+ Pred1 = SomePHI->getIncomingBlock(0);
+ Pred2 = SomePHI->getIncomingBlock(1);
+ } else {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE) // No predecessor
+ return nullptr;
+ Pred1 = *PI++;
+ if (PI == PE) // Only one predecessor
+ return nullptr;
+ Pred2 = *PI++;
+ if (PI != PE) // More than two predecessors
+ return nullptr;
+ }
+
+ // We can only handle branches. Other control flow will be lowered to
+ // branches if possible anyway.
+ BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+ BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+ if (!Pred1Br || !Pred2Br)
+ return nullptr;
+
+ // Eliminate code duplication by ensuring that Pred1Br is conditional if
+ // either are.
+ if (Pred2Br->isConditional()) {
+ // If both branches are conditional, we don't have an "if statement". In
+ // reality, we could transform this case, but since the condition will be
+ // required anyway, we stand no chance of eliminating it, so the xform is
+ // probably not profitable.
+ if (Pred1Br->isConditional())
+ return nullptr;
+
+ std::swap(Pred1, Pred2);
+ std::swap(Pred1Br, Pred2Br);
+ }
+
+ if (Pred1Br->isConditional()) {
+ // The only thing we have to watch out for here is to make sure that Pred2
+ // doesn't have incoming edges from other blocks. If it does, the condition
+ // doesn't dominate BB.
+ if (!Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // If we found a conditional branch predecessor, make sure that it branches
+ // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
+ if (Pred1Br->getSuccessor(0) == BB &&
+ Pred1Br->getSuccessor(1) == Pred2) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+ Pred1Br->getSuccessor(1) == BB) {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ } else {
+ // We know that one arm of the conditional goes to BB, so the other must
+ // go somewhere unrelated, and this must not be an "if statement".
+ return nullptr;
+ }
+
+ return Pred1Br->getCondition();
+ }
+
+ // Ok, if we got here, both predecessors end with an unconditional branch to
+ // BB. Don't panic! If both blocks only have a single (identical)
+ // predecessor, and THAT is a conditional branch, then we're all ok!
+ BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+ if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // Otherwise, if this is a conditional branch, then we can use it!
+ BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+ if (!BI) return nullptr;
+
+ assert(BI->isConditional() && "Two successors but not conditional?");
+ if (BI->getSuccessor(0) == Pred1) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ }
+ return BI->getCondition();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
new file mode 100644
index 000000000000..175cbd2ce0df
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -0,0 +1,328 @@
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block. This pass may be "required" by passes that
+// cannot deal with critical edges. For this usage, the structure type is
+// forward declared. This pass obviously invalidates the CFG, but can update
+// dominator trees.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "break-crit-edges"
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+ struct BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(ID) {
+ initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ unsigned N =
+ SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ NumBroken += N;
+ return N > 0;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+ };
+}
+
+char BreakCriticalEdges::ID = 0;
+INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
+ "Break critical edges in CFG", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+ return new BreakCriticalEdges();
+}
+
+PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ NumBroken += N;
+ if (N == 0)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
+/// exit block. This function inserts the new PHIs, as needed. Preds is a list
+/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
+/// the old loop exit, now the successor of SplitBB.
+static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
+ BasicBlock *SplitBB,
+ BasicBlock *DestBB) {
+ // SplitBB shouldn't have anything non-trivial in it yet.
+ assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
+ SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
+
+ // For each PHI in the destination block.
+ for (BasicBlock::iterator I = DestBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ unsigned Idx = PN->getBasicBlockIndex(SplitBB);
+ Value *V = PN->getIncomingValue(Idx);
+
+ // If the input is a PHI which already satisfies LCSSA, don't create
+ // a new one.
+ if (const PHINode *VP = dyn_cast<PHINode>(V))
+ if (VP->getParent() == SplitBB)
+ continue;
+
+ // Otherwise a new PHI is needed. Create one and populate it.
+ PHINode *NewPN = PHINode::Create(
+ PN->getType(), Preds.size(), "split",
+ SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i)
+ NewPN->addIncoming(V, Preds[i]);
+
+ // Update the original PHI.
+ PN->setIncomingValue(Idx, NewPN);
+ }
+}
+
+BasicBlock *
+llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+ const CriticalEdgeSplittingOptions &Options) {
+ if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
+ return nullptr;
+
+ assert(!isa<IndirectBrInst>(TI) &&
+ "Cannot split critical edge from IndirectBrInst");
+
+ BasicBlock *TIBB = TI->getParent();
+ BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+ // Splitting the critical edge to a pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (DestBB->isEHPad()) return nullptr;
+
+ // Create a new basic block, linking it into the CFG.
+ BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
+ TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
+ // Create our unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+ NewBI->setDebugLoc(TI->getDebugLoc());
+
+ // Branch to the new block, breaking the edge.
+ TI->setSuccessor(SuccNum, NewBB);
+
+ // Insert the block into the function... right after the block TI lives in.
+ Function &F = *TIBB->getParent();
+ Function::iterator FBBI = TIBB->getIterator();
+ F.getBasicBlockList().insert(++FBBI, NewBB);
+
+ // If there are any PHI nodes in DestBB, we need to update them so that they
+ // merge incoming values from NewBB instead of from TIBB.
+ {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ // We no longer enter through TIBB, now we come in through NewBB.
+ // Revector exactly one entry in the PHI node that used to come from
+ // TIBB to come from NewBB.
+ PHINode *PN = cast<PHINode>(I);
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+ // happens because the BB list of PHI nodes are usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != TIBB)
+ BBIdx = PN->getBasicBlockIndex(TIBB);
+ PN->setIncomingBlock(BBIdx, NewBB);
+ }
+ }
+
+ // If there are any other edges from TIBB to DestBB, update those to go
+ // through the split block, making those edges non-critical as well (and
+ // reducing the number of phi entries in the DestBB if relevant).
+ if (Options.MergeIdenticalEdges) {
+ for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (TI->getSuccessor(i) != DestBB) continue;
+
+ // Remove an entry for TIBB from DestBB phi nodes.
+ DestBB->removePredecessor(TIBB, Options.DontDeleteUselessPHIs);
+
+ // We found another edge to DestBB, go to NewBB instead.
+ TI->setSuccessor(i, NewBB);
+ }
+ }
+
+ // If we have nothing to update, just return.
+ auto *DT = Options.DT;
+ auto *LI = Options.LI;
+ if (!DT && !LI)
+ return NewBB;
+
+ // Now update analysis information. Since the only predecessor of NewBB is
+ // the TIBB, TIBB clearly dominates NewBB. TIBB usually doesn't dominate
+ // anything, as there are other successors of DestBB. However, if all other
+ // predecessors of DestBB are already dominated by DestBB (e.g. DestBB is a
+ // loop header) then NewBB dominates DestBB.
+ SmallVector<BasicBlock*, 8> OtherPreds;
+
+ // If there is a PHI in the block, loop over predecessors with it, which is
+ // faster than iterating pred_begin/end.
+ if (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) != NewBB)
+ OtherPreds.push_back(PN->getIncomingBlock(i));
+ } else {
+ for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB);
+ I != E; ++I) {
+ BasicBlock *P = *I;
+ if (P != NewBB)
+ OtherPreds.push_back(P);
+ }
+ }
+
+ bool NewBBDominatesDestBB = true;
+
+ // Should we update DominatorTree information?
+ if (DT) {
+ DomTreeNode *TINode = DT->getNode(TIBB);
+
+ // The new block is not the immediate dominator for any other nodes, but
+ // TINode is the immediate dominator for the new node.
+ //
+ if (TINode) { // Don't break unreachable code!
+ DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
+ DomTreeNode *DestBBNode = nullptr;
+
+ // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
+ if (!OtherPreds.empty()) {
+ DestBBNode = DT->getNode(DestBB);
+ while (!OtherPreds.empty() && NewBBDominatesDestBB) {
+ if (DomTreeNode *OPNode = DT->getNode(OtherPreds.back()))
+ NewBBDominatesDestBB = DT->dominates(DestBBNode, OPNode);
+ OtherPreds.pop_back();
+ }
+ OtherPreds.clear();
+ }
+
+ // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
+ // doesn't dominate anything.
+ if (NewBBDominatesDestBB) {
+ if (!DestBBNode) DestBBNode = DT->getNode(DestBB);
+ DT->changeImmediateDominator(DestBBNode, NewBBNode);
+ }
+ }
+ }
+
+ // Update LoopInfo if it is around.
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+ // If one or the other block is not in a loop, the new block is not in a
+ // loop either, and thus LI doesn't need to be updated.
+ if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+ if (TIL == DestLoop) {
+ // Both are in the same loop; NewBB joins that loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NewBB, *LI);
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == DestBB &&
+ "Should not create irreducible loops!");
+ if (Loop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NewBB, *LI);
+ }
+ }
+
+ // If TIBB is in a loop and DestBB is outside of that loop, we may need
+ // to update LoopSimplify form and LCSSA form.
+ if (!TIL->contains(DestBB)) {
+ assert(!TIL->contains(NewBB) &&
+ "Split point for loop exit is contained in loop!");
+
+ // Update LCSSA form in the newly created exit block.
+ if (Options.PreserveLCSSA) {
+ createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
+ }
+
+ // The only way that we can break LoopSimplify form by splitting a critical
+ // edge is if after the split there exists some edge from TIL to DestBB
+ // *and* the only edge into DestBB from outside of TIL is that of
+ // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
+ // is the new exit block and it has no non-loop predecessors. If the
+ // second isn't true, then DestBB was not in LoopSimplify form prior to
+ // the split as it had a non-loop predecessor. In both of these cases,
+ // the predecessor must be directly in TIL, not in a subloop, or again
+ // LoopSimplify doesn't hold.
+ SmallVector<BasicBlock *, 4> LoopPreds;
+ for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E;
+ ++I) {
+ BasicBlock *P = *I;
+ if (P == NewBB)
+ continue; // The new block is known.
+ if (LI->getLoopFor(P) != TIL) {
+ // No need to re-simplify, it wasn't to start with.
+ LoopPreds.clear();
+ break;
+ }
+ LoopPreds.push_back(P);
+ }
+ if (!LoopPreds.empty()) {
+ assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
+ BasicBlock *NewExitBB = SplitBlockPredecessors(
+ DestBB, LoopPreds, "split", DT, LI, Options.PreserveLCSSA);
+ if (Options.PreserveLCSSA)
+ createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
+ }
+ }
+ }
+ }
+
+ return NewBB;
+}
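+
+// A minimal usage sketch, assuming a terminator TI plus (possibly null)
+// DominatorTree DT and LoopInfo LI in the caller: each critical edge leaving
+// TI can be split while preserving the analyses the caller holds; the call
+// returns nullptr for edges that are not critical.
+//
+//   for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I)
+//     SplitCriticalEdge(TI, I, CriticalEdgeSplittingOptions(DT, LI));
+//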
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
new file mode 100644
index 000000000000..b60dfb4f3541
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -0,0 +1,1015 @@
+//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some functions that will create standard C libcalls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "build-libcalls"
+
+//- Infer Attributes ---------------------------------------------------------//
+
+STATISTIC(NumReadNone, "Number of functions inferred as readnone");
+STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
+STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
+STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
+STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
+STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
+
+static bool setDoesNotAccessMemory(Function &F) {
+ if (F.doesNotAccessMemory())
+ return false;
+ F.setDoesNotAccessMemory();
+ ++NumReadNone;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F) {
+ if (F.onlyReadsMemory())
+ return false;
+ F.setOnlyReadsMemory();
+ ++NumReadOnly;
+ return true;
+}
+
+static bool setOnlyAccessesArgMemory(Function &F) {
+ if (F.onlyAccessesArgMemory())
+ return false;
+ F.setOnlyAccessesArgMemory();
+ ++NumArgMemOnly;
+ return true;
+}
+
+static bool setDoesNotThrow(Function &F) {
+ if (F.doesNotThrow())
+ return false;
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ return true;
+}
+
+static bool setRetDoesNotAlias(Function &F) {
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
+ return false;
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::NoCapture);
+ ++NumNoCapture;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::ReadOnly);
+ ++NumReadOnlyArg;
+ return true;
+}
+
+static bool setRetNonNull(Function &F) {
+ assert(F.getReturnType()->isPointerTy() &&
+ "nonnull applies only to pointers");
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull))
+ return false;
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNull;
+ return true;
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+ LibFunc TheLibFunc;
+ if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
+ return false;
+
+ bool Changed = false;
+ switch (TheLibFunc) {
+ case LibFunc_strlen:
+ case LibFunc_wcslen:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_strcpy:
+ case LibFunc_stpcpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ case LibFunc_strncpy:
+ case LibFunc_stpncpy:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_strxfrm:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_strcmp: // 0,1
+ case LibFunc_strspn: // 0,1
+ case LibFunc_strncmp: // 0,1
+ case LibFunc_strcspn: // 0,1
+ case LibFunc_strcoll: // 0,1
+ case LibFunc_strcasecmp: // 0,1
+  case LibFunc_strncasecmp: // 0,1
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strstr:
+ case LibFunc_strpbrk:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_scanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strdup:
+ case LibFunc_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_sscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_sprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_snprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_setitimer:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_system:
+ // May throw; "system" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_malloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_memcmp:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memcpy:
+ case LibFunc_mempcpy:
+ case LibFunc_memccpy:
+ case LibFunc_memmove:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_memcpy_chk:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_memalign:
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_mkdir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_mktime:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_realloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_read:
+ // May throw; "read" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_rewind:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_rename:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_readlink:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_write:
+ // May throw; "write" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_bcopy:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_bcmp:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_bzero:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_calloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_chmod:
+ case LibFunc_chown:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ctermid:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atof:
+ case LibFunc_atoll:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_access:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fdopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_feof:
+ case LibFunc_free:
+ case LibFunc_fseek:
+ case LibFunc_ftell:
+ case LibFunc_fgetc:
+ case LibFunc_fseeko:
+ case LibFunc_ftello:
+ case LibFunc_fileno:
+ case LibFunc_fflush:
+ case LibFunc_fclose:
+ case LibFunc_fsetpos:
+ case LibFunc_flockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_ftrylockfile:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_ferror:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc_fputc:
+ case LibFunc_fstat:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
+ case LibFunc_fstatvfs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_fgets:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc_fread:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_fwrite:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ // FIXME: readonly #1?
+ return Changed;
+ case LibFunc_fputs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fscanf:
+ case LibFunc_fprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fgetpos:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getc:
+ case LibFunc_getlogin_r:
+ case LibFunc_getc_unlocked:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getenv:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_gets:
+ case LibFunc_getchar:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_getitimer:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getpwnam:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ungetc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_uname:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_unlink:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_unsetenv:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_utime:
+ case LibFunc_utimes:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_puts:
+ case LibFunc_printf:
+ case LibFunc_perror:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_pread:
+ // May throw; "pread" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_pwrite:
+ // May throw; "pwrite" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putchar:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_popen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_pclose:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_vscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vsscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vfscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_valloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_vprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vsnprintf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_open:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_opendir:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_tmpfile:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_times:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_htonl:
+ case LibFunc_htons:
+ case LibFunc_ntohl:
+ case LibFunc_ntohs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAccessMemory(F);
+ return Changed;
+ case LibFunc_lstat:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_lchown:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_qsort:
+ // May throw; places call through function pointer.
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_under_IO_getc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_under_IO_putc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_dunder_isoc99_scanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_isoc99_sscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fopen64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fseeko64:
+ case LibFunc_ftello64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_tmpfile64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_open64:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_gettimeofday:
+ // Currently some platforms have the restrict keyword on the arguments to
+ // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+ // arguments.
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_Znwj: // new(unsigned int)
+ case LibFunc_Znwm: // new(unsigned long)
+ case LibFunc_Znaj: // new[](unsigned int)
+ case LibFunc_Znam: // new[](unsigned long)
+ case LibFunc_msvc_new_int: // new(unsigned int)
+ case LibFunc_msvc_new_longlong: // new(unsigned long long)
+ case LibFunc_msvc_new_array_int: // new[](unsigned int)
+ case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
+ // Operator new always returns a nonnull noalias pointer
+ Changed |= setRetNonNull(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+  // TODO: add LibFunc entries for:
+ //case LibFunc_memset_pattern4:
+ //case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern16:
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ // int __nvvm_reflect(const char *)
+ case LibFunc_nvvm_reflect:
+ Changed |= setDoesNotAccessMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+
+ default:
+ // FIXME: It'd be really nice to cover all the library functions we're
+ // aware of here.
+ return false;
+ }
+}
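+
+// A minimal usage sketch, assuming a Module M and a TargetLibraryInfo TLI are
+// already available in the caller: attribute inference is typically run over
+// every known library declaration in the module.
+//
+//   bool Changed = false;
+//   for (Function &F : M)
+//     if (F.isDeclaration() && !F.isIntrinsic())
+//       Changed |= inferLibFuncAttributes(F, TLI);
+//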
+
+//- Emit LibCalls ------------------------------------------------------------//
+
+Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
+}
+
+Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_strlen))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
+ B.getInt8PtrTy());
+ inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
+ CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
+ if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
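+
+// A minimal usage sketch, assuming an insertion point Instruction *InsertPt, a
+// string pointer Value *StrPtr, and a TargetLibraryInfo TLI are in scope; the
+// helper returns nullptr when strlen is unavailable on the target.
+//
+//   IRBuilder<> B(InsertPt);
+//   const DataLayout &DL = InsertPt->getModule()->getDataLayout();
+//   if (Value *Len = emitStrLen(StrPtr, B, DL, &TLI))
+//     B.CreateIsNull(Len, "is.empty"); // e.g. materialize strlen(s) == 0
+//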
+
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_strchr))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Type *I8Ptr = B.getInt8PtrTy();
+ Type *I32Ty = B.getInt32Ty();
+ Constant *StrChr =
+ M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
+ inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
+ CallInst *CI = B.CreateCall(
+ StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
+ if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_strncmp))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
+ CallInst *CI = B.CreateCall(
+ StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
+
+ if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI, StringRef Name) {
+ if (!TLI->has(LibFunc_strcpy))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Type *I8Ptr = B.getInt8PtrTy();
+ Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
+ inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+ CallInst *CI =
+ B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
+ if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI, StringRef Name) {
+ if (!TLI->has(LibFunc_strncpy))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Type *I8Ptr = B.getInt8PtrTy();
+ Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
+ Len->getType());
+ inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+ CallInst *CI = B.CreateCall(
+ StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
+ if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+ IRBuilder<> &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_memcpy_chk))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ AttributeList AS;
+ AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *MemCpy = M->getOrInsertFunction(
+ "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context));
+ Dst = castToCStr(Dst, B);
+ Src = castToCStr(Src, B);
+ CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
+ if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_memchr))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt32Ty(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
+ CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
+
+ if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_memcmp))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
+ CallInst *CI = B.CreateCall(
+ MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
+
+ if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+/// Append a suffix to the function name according to the type of 'Op'.
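+/// For example, with a float operand "floor" becomes "floorf", and with a long
+/// double operand it becomes "floorl"; a double operand keeps the plain name.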
+static void appendTypeSuffix(Value *Op, StringRef &Name,
+ SmallString<20> &NameBuffer) {
+ if (!Op->getType()->isDoubleTy()) {
+ NameBuffer += Name;
+
+ if (Op->getType()->isFloatTy())
+ NameBuffer += 'f';
+ else
+ NameBuffer += 'l';
+
+ Name = NameBuffer;
+ }
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+ const AttributeList &Attrs) {
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op, Name, NameBuffer);
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+ Op->getType());
+ CallInst *CI = B.CreateCall(Callee, Op, Name);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+  // is being replaced with a library call, which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
+ if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
+ IRBuilder<> &B, const AttributeList &Attrs) {
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op1, Name, NameBuffer);
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
+ Op2->getType());
+ CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
+ CI->setAttributes(Attrs);
+ if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
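+
+// A minimal usage sketch, assuming two double operands X and Y and an
+// IRBuilder B at the desired insertion point; an empty AttributeList is
+// sufficient when no attributes need to be carried over.
+//
+//   Value *Min = emitBinaryFloatFnCall(X, Y, "fmin", B, AttributeList());
+//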
+
+Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_putchar))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
+ inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
+ CallInst *CI = B.CreateCall(PutChar,
+ B.CreateIntCast(Char,
+ B.getInt32Ty(),
+ /*isSigned*/true,
+ "chari"),
+ "putchar");
+
+ if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_puts))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Value *PutS =
+ M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
+ inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
+ CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
+ if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
+ File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
+ Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
+ "chari");
+ CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputs))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutsName = TLI->getName(LibFunc_fputs);
+ Constant *F = M->getOrInsertFunction(
+ FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fwrite))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FWriteName = TLI->getName(LibFunc_fwrite);
+ Constant *F = M->getOrInsertFunction(
+ FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
+ CallInst *CI =
+ B.CreateCall(F, {castToCStr(Ptr, B), Size,
+ ConstantInt::get(DL.getIntPtrType(Context), 1), File});
+
+ if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
new file mode 100644
index 000000000000..83ec7f55d1af
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -0,0 +1,479 @@
+//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom, 32-bit divides are slow enough that it is
+// profitable to check the operand values at run time and, if both are positive
+// and less than 256, use an unsigned 8-bit divide instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bypass-slow-division"
+
+namespace {
+ struct DivOpInfo {
+ bool SignedOp;
+ Value *Dividend;
+ Value *Divisor;
+
+ DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
+ : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
+ };
+
+ struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
+
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ /// A quotient and remainder, plus a BB from which they logically "originate".
+ /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+ /// corresponding predecessor.
+ struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
+ };
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<DivOpInfo> {
+ static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
+ return Val1.SignedOp == Val2.SignedOp &&
+ Val1.Dividend == Val2.Dividend &&
+ Val1.Divisor == Val2.Divisor;
+ }
+
+ static DivOpInfo getEmptyKey() {
+ return DivOpInfo(false, nullptr, nullptr);
+ }
+
+ static DivOpInfo getTombstoneKey() {
+ return DivOpInfo(true, nullptr, nullptr);
+ }
+
+ static unsigned getHashValue(const DivOpInfo &Val) {
+ return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
+ reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+ (unsigned)Val.SignedOp;
+ }
+ };
+
+ typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
+ typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
+ typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
+}
+
+namespace {
+enum ValueRange {
+ /// Operand definitely fits into BypassType. No runtime checks are needed.
+ VALRNG_KNOWN_SHORT,
+ /// A runtime check is required, as value range is unknown.
+ VALRNG_UNKNOWN,
+ /// Operand is unlikely to fit into BypassType. The bypassing should be
+ /// disabled.
+ VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+ bool IsValidTask = false;
+ Instruction *SlowDivOrRem = nullptr;
+ IntegerType *BypassType = nullptr;
+ BasicBlock *MainBB = nullptr;
+
+ bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+ ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+ QuotRemWithBB createSlowBB(BasicBlock *Successor);
+ QuotRemWithBB createFastBB(BasicBlock *Successor);
+ QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+ BasicBlock *PhiBB);
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+ Optional<QuotRemPair> insertFastDivAndRem();
+
+ bool isSignedOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::SRem;
+ }
+ bool isDivisionOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::UDiv;
+ }
+ Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+ Value *getReplacement(DivCacheTy &Cache);
+};
+} // anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+ const BypassWidthsTy &BypassWidths) {
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ SlowDivOrRem = I;
+ break;
+ default:
+ // I is not a div/rem operation.
+ return;
+ }
+
+ // Skip division on vector types. Only optimize integer instructions.
+ IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+ if (!SlowType)
+ return;
+
+ // Skip if this bitwidth is not bypassed.
+ auto BI = BypassWidths.find(SlowType->getBitWidth());
+ if (BI == BypassWidths.end())
+ return;
+
+ // Get type for div/rem instruction with bypass bitwidth.
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+ BypassType = BT;
+
+ // The original basic block.
+ MainBB = I->getParent();
+
+ // The instruction is indeed a slow div or rem operation.
+ IsValidTask = true;
+}
+
+/// Reuses a previously computed quotient or remainder from the current BB if
+/// the operands and operation are identical. Otherwise calls
+/// insertFastDivAndRem to perform the optimization and caches the resulting
+/// quotient and remainder. If no replacement can be generated, nullptr is
+/// returned.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+ // First, make sure that the task is valid.
+ if (!IsValidTask)
+ return nullptr;
+
+ // Then, look for a value in Cache.
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ DivOpInfo Key(isSignedOp(), Dividend, Divisor);
+ auto CacheI = Cache.find(Key);
+
+ if (CacheI == Cache.end()) {
+ // If previous instance does not exist, try to insert fast div.
+ Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+ // Bail out if insertFastDivAndRem has failed.
+ if (!OptResult)
+ return nullptr;
+ CacheI = Cache.insert({Key, *OptResult}).first;
+ }
+
+ QuotRemPair &Value = CacheI->second;
+ return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// \brief Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Xor:
+ return true;
+ case Instruction::Mul: {
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ Value *Op1 = I->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+ if (!C && isa<BitCastInst>(Op1))
+ C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+ return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+ }
+ case Instruction::PHI: {
+    // Stop IR traversal on pathological input code; this limits the recursion
+    // depth.
+ if (Visited.size() >= 16)
+ return false;
+ // Do not visit nodes that have been visited already. We return true because
+ // it means that we couldn't find any value that doesn't look hash-like.
+ if (Visited.find(I) != Visited.end())
+ return true;
+ Visited.insert(I);
+ return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+ // Ignore undef values as they probably don't affect the division
+ // operands.
+ return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+ isa<UndefValue>(V);
+ });
+ }
+ default:
+ return false;
+ }
+}
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+ VisitedSetTy &Visited) {
+ unsigned ShortLen = BypassType->getBitWidth();
+ unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+ assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+ unsigned HiBits = LongLen - ShortLen;
+
+ const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+ KnownBits Known(LongLen);
+
+ computeKnownBits(V, Known, DL);
+
+ if (Known.countMinLeadingZeros() >= HiBits)
+ return VALRNG_KNOWN_SHORT;
+
+ if (Known.countMaxLeadingZeros() < HiBits)
+ return VALRNG_LIKELY_LONG;
+
+ // Long integer divisions are often used in hashtable implementations. It's
+ // not worth bypassing such divisions because hash values are extremely
+ // unlikely to have enough leading zeros. The call below tries to detect
+ // values that are unlikely to fit BypassType (including hashes).
+ if (isHashLikeValue(V, Visited))
+ return VALRNG_LIKELY_LONG;
+
+ return VALRNG_UNKNOWN;
+}
+
+/// Add a new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isSignedOp()) {
+ DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
+ } else {
+ DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
+ }
+
+ Builder.CreateBr(SuccessorBB);
+ return DivRemPair;
+}
+
+/// Add a new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ Value *ShortDivisorV =
+ Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV =
+ Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because this optimization only handles positive numbers.
+ Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+ Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+ DivRemPair.Quotient =
+ Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+ DivRemPair.Remainder =
+ Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+ Builder.CreateBr(SuccessorBB);
+
+ return DivRemPair;
+}
+
+/// Creates Phi nodes for result of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+ QuotRemWithBB &RHS,
+ BasicBlock *PhiBB) {
+ IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+ QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+ QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+ PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+ RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+ RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+ return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. A true return
+/// value means that the operands fit. Either operand may be null if it does
+/// not need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
+ IRBuilder<> Builder(MainBB, MainBB->end());
+
+ Value *OrV;
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
+ else
+ OrV = Op1 ? Op1 : Op2;
+
+  // The bit mask is inverted so that ANDing keeps only the bits above the
+  // bypass type's width; a zero result means the operands fit.
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values
+ Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+ return Builder.CreateICmpEQ(AndV, ZeroV);
+}
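+
+// A small worked example, assuming 32-bit operands and an 8-bit BypassType:
+// getBitMask() is 0xFF, so the (truncated) inverted mask is 0xFFFFFF00 and the
+// emitted check is
+//
+//   ((Op1 | Op2) & 0xFFFFFF00) == 0
+//
+// which holds exactly when both operands fit in the low 8 bits.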
+
+/// Substitutes the div/rem instruction with code that checks the value of the
+/// operands and uses a shorter, faster div/rem instruction when possible.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isa<ConstantInt>(Divisor)) {
+ // Keep division by a constant for DAGCombiner.
+ return None;
+ }
+
+ VisitedSetTy SetL;
+ ValueRange DividendRange = getValueRange(Dividend, SetL);
+ if (DividendRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ VisitedSetTy SetR;
+ ValueRange DivisorRange = getValueRange(Divisor, SetR);
+ if (DivisorRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place.
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ } else if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+    // 1) Divisor is less than or equal to Dividend, and the result can be
+    //    computed with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
+ } else {
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
+ }
+}
+
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+ const BypassWidthsTy &BypassWidths) {
+ DivCacheTy PerBBDivCache;
+
+ bool MadeChange = false;
+ Instruction* Next = &*BB->begin();
+ while (Next != nullptr) {
+ // We may add instructions immediately after I, but we want to skip over
+ // them.
+ Instruction* I = Next;
+ Next = Next->getNextNode();
+
+ FastDivInsertionTask Task(I, BypassWidths);
+ if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+
+ // Above we eagerly create divs and rems, as pairs, so that we can efficiently
+ // create divrem machine instructions. Now erase any unused divs / rems so we
+ // don't leave extra instructions sitting around.
+ for (auto &KV : PerBBDivCache)
+ for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+
+ return MadeChange;
+}
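+
+// A minimal usage sketch, assuming a Function F whose target executes 8-bit
+// divides much faster than 32-bit ones (the {32, 8} mapping below is purely
+// illustrative):
+//
+//   BypassWidthsTy BypassWidths;
+//   BypassWidths[32] = 8; // bypass 32-bit div/rem with 8-bit ones
+//   bool Changed = false;
+//   for (BasicBlock &BB : F)
+//     Changed |= bypassSlowDivision(&BB, BypassWidths);
+//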
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
new file mode 100644
index 000000000000..7e75e8847785
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -0,0 +1,833 @@
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner. This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+using namespace llvm;
+
+/// See comments in Cloning.h.
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, Function *F,
+ ClonedCodeInfo *CodeInfo,
+ DebugInfoFinder *DIFinder) {
+ DenseMap<const MDNode *, MDNode *> Cache;
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+ Module *TheModule = F ? F->getParent() : nullptr;
+
+ // Loop over all instructions, and copy them over.
+ for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
+ II != IE; ++II) {
+
+ if (DIFinder && TheModule) {
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(II))
+ DIFinder->processDeclare(*TheModule, DDI);
+ else if (auto *DVI = dyn_cast<DbgValueInst>(II))
+ DIFinder->processValue(*TheModule, DVI);
+
+ if (auto DbgLoc = II->getDebugLoc())
+ DIFinder->processLocation(*TheModule, DbgLoc.get());
+ }
+
+ Instruction *NewInst = II->clone();
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[&*II] = NewInst; // Add instruction map to value.
+
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->getEntryBlock();
+ }
+ return NewBB;
+}
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references to
+// VMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix, ClonedCodeInfo *CodeInfo,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+ for (const Argument &I : OldFunc->args())
+ assert(VMap.count(&I) && "No mapping from source argument specified!");
+#endif
+
+ // Copy all attributes other than those stored in the AttributeList. We need
+ // to remap the parameter indices of the AttributeList.
+ AttributeList NewAttrs = NewFunc->getAttributes();
+ NewFunc->copyAttributesFrom(OldFunc);
+ NewFunc->setAttributes(NewAttrs);
+
+ // Fix up the personality function that got copied over.
+ if (OldFunc->hasPersonalityFn())
+ NewFunc->setPersonalityFn(
+ MapValue(OldFunc->getPersonalityFn(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
+
+ SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+ AttributeList OldAttrs = OldFunc->getAttributes();
+
+ // Clone any argument attributes that are present in the VMap.
+ for (const Argument &OldArg : OldFunc->args()) {
+ if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
+ NewArgAttrs[NewArg->getArgNo()] =
+ OldAttrs.getParamAttributes(OldArg.getArgNo());
+ }
+ }
+
+ NewFunc->setAttributes(
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+ OldAttrs.getRetAttributes(), NewArgAttrs));
+
+ bool MustCloneSP =
+ OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+ DISubprogram *SP = OldFunc->getSubprogram();
+ if (SP) {
+ assert(!MustCloneSP || ModuleLevelChanges);
+ // Add mappings for some DebugInfo nodes that we don't want duplicated
+ // even if they're distinct.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ // If we're not cloning into the same module, no need to clone the
+ // subprogram
+ if (!MustCloneSP)
+ MD[SP].reset(SP);
+ }
+
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ OldFunc->getAllMetadata(MDs);
+ for (auto MD : MDs) {
+ NewFunc->addMetadata(
+ MD.first,
+ *MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
+ }
+
+ // When we remap instructions, we want to avoid duplicating inlined
+ // DISubprograms, so record all subprograms we find as we duplicate
+ // instructions and then freeze them in the MD map.
+ // We also record information about dbg.value and dbg.declare to avoid
+ // duplicating the types.
+ DebugInfoFinder DIFinder;
+
+ // Loop over all of the basic blocks in the function, cloning them as
+ // appropriate. Note that we save BE this way in order to handle cloning of
+ // recursive functions into themselves.
+ //
+ for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
+ SP ? &DIFinder : nullptr);
+
+ // Add basic block mapping.
+ VMap[&BB] = CBB;
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ if (BB.hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(&BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
+ }
+
+ // Note return instructions for the caller.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+ Returns.push_back(RI);
+ }
+
+ for (DISubprogram *ISP : DIFinder.subprograms()) {
+ if (ISP != SP) {
+ VMap.MD()[ISP].reset(ISP);
+ }
+ }
+
+ for (auto *Type : DIFinder.types()) {
+ VMap.MD()[Type].reset(Type);
+ }
+
+ // Loop over all of the instructions in the function, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ for (Function::iterator BB =
+ cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
+ BE = NewFunc->end();
+ BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it...
+ for (Instruction &II : *BB)
+ RemapInstruction(&II, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+}
+
+/// Return a copy of the specified function and add it to that function's
+/// module. Also, any references specified in the VMap are changed to refer to
+/// their mapped value instead of the original one. If any of the arguments to
+/// the function are in the VMap, the arguments are deleted from the resultant
+/// function. The VMap is updated to include mappings from all of the
+/// instructions and basicblocks in the function from their old to new values.
+///
+Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
+ ClonedCodeInfo *CodeInfo) {
+ std::vector<Type*> ArgTypes;
+
+ // The user might be deleting arguments to the function by specifying them in
+ // the VMap. If so, we should not add those arguments to the argument type
+ // vector.
+ //
+ for (const Argument &I : F->args())
+ if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
+ ArgTypes.push_back(I.getType());
+
+ // Create a new function type...
+ FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+ ArgTypes, F->getFunctionType()->isVarArg());
+
+ // Create the new function...
+ Function *NewF =
+ Function::Create(FTy, F->getLinkage(), F->getName(), F->getParent());
+
+ // Loop over the arguments, copying the names of the mapped arguments over...
+ Function::arg_iterator DestI = NewF->arg_begin();
+ for (const Argument & I : F->args())
+ if (VMap.count(&I) == 0) { // Is this argument preserved?
+ DestI->setName(I.getName()); // Copy the name over...
+ VMap[&I] = &*DestI++; // Add mapping to VMap
+ }
+
+ SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
+ CodeInfo);
+
+ return NewF;
+}
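+// Illustrative usage sketch (hypothetical caller code, assuming F's first
+// argument is an i32): pre-populating VMap before the call both drops that
+// argument from the clone's signature and rewrites its uses inside the body,
+// which is how a caller can specialize a function on a constant:
+//
+//   ValueToValueMapTy VMap;
+//   Argument *A0 = &*F->arg_begin();
+//   VMap[A0] = ConstantInt::get(A0->getType(), 42);
+//   Function *Specialized = CloneFunction(F, VMap);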
+
+
+
+namespace {
+ /// This is a private class used to implement CloneAndPruneFunctionInto.
+ struct PruningFunctionCloner {
+ Function *NewFunc;
+ const Function *OldFunc;
+ ValueToValueMapTy &VMap;
+ bool ModuleLevelChanges;
+ const char *NameSuffix;
+ ClonedCodeInfo *CodeInfo;
+
+ public:
+ PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+ ValueToValueMapTy &valueMap, bool moduleLevelChanges,
+ const char *nameSuffix, ClonedCodeInfo *codeInfo)
+ : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
+ ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
+ CodeInfo(codeInfo) {}
+
+ /// The specified block is found to be reachable, clone it and
+ /// anything that it can reach.
+ void CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone);
+ };
+}
+
+/// The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone){
+ WeakTrackingVH &BBEntry = VMap[BB];
+
+ // Have we already cloned this block?
+ if (BBEntry) return;
+
+ // Nope, clone it now.
+ BasicBlock *NewBB;
+ BBEntry = NewBB = BasicBlock::Create(BB->getContext());
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ //
+ // Note that we don't need to fix the mapping for unreachable blocks;
+ // the default mapping there is safe.
+ if (BB->hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
+ }
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+
+ // Loop over all instructions, and copy them over, DCE'ing as we go. This
+ // loop doesn't include the terminator.
+ for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
+ II != IE; ++II) {
+
+ Instruction *NewInst = II->clone();
+
+ // Eagerly remap operands to the newly cloned instruction, except for PHI
+ // nodes for which we defer processing until we update the CFG.
+ if (!isa<PHINode>(NewInst)) {
+ RemapInstruction(NewInst, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+
+ // If we can simplify this instruction to some other value, simply add
+ // a mapping to that value rather than inserting a new instruction into
+ // the basic block.
+ if (Value *V =
+ SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
+ // On the off-chance that this simplifies to an instruction in the old
+ // function, map it back into the new function.
+ if (Value *MappedV = VMap.lookup(V))
+ V = MappedV;
+
+ if (!NewInst->mayHaveSideEffects()) {
+ VMap[&*II] = V;
+ NewInst->deleteValue();
+ continue;
+ }
+ }
+ }
+
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ VMap[&*II] = NewInst; // Add instruction map to value.
+ NewBB->getInstList().push_back(NewInst);
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+
+ if (CodeInfo)
+ if (auto CS = ImmutableCallSite(&*II))
+ if (CS.hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ // Finally, clone over the terminator.
+ const TerminatorInst *OldTI = BB->getTerminator();
+ bool TerminatorDone = false;
+ if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+ if (BI->isConditional()) {
+ // If the condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ // Or is a known constant in the caller...
+ if (!Cond) {
+ Value *V = VMap.lookup(BI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+
+ // Constant fold to uncond branch!
+ if (Cond) {
+ BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+ } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+ // If the switch condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond) { // Or is a known constant in the caller...
+ Value *V = VMap.lookup(SI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+ if (Cond) { // Constant fold to uncond branch!
+ SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
+ BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+
+ if (!TerminatorDone) {
+ Instruction *NewInst = OldTI->clone();
+ if (OldTI->hasName())
+ NewInst->setName(OldTI->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[OldTI] = NewInst; // Add instruction map to value.
+
+ if (CodeInfo)
+ if (auto CS = ImmutableCallSite(OldTI))
+ if (CS.hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ // Recursively clone any reachable successor blocks.
+ const TerminatorInst *TI = BB->getTerminator();
+ for (const BasicBlock *Succ : TI->successors())
+ ToClone.push_back(Succ);
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->front();
+ }
+}
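+// Worked example of the terminator folding above (hypothetical IR): if the
+// block being cloned ends with
+//
+//   br i1 %flag, label %then, label %else
+//
+// and VMap maps %flag to i1 true (e.g. a constant argument at an inlined call
+// site), the clone receives an unconditional "br label %then" and only %then
+// is pushed onto ToClone; %else is never cloned unless it is reachable some
+// other way.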
+
+/// This works like CloneAndPruneFunctionInto, except that it does not clone the
+/// entire function. Instead it starts at an instruction provided by the caller
+/// and copies (and prunes) only the code reachable from that instruction.
+void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+ const Instruction *StartingInst,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+ ValueMapTypeRemapper *TypeMapper = nullptr;
+ ValueMaterializer *Materializer = nullptr;
+
+#ifndef NDEBUG
+ // If the cloning starts at the beginning of the function, verify that
+ // the function arguments are mapped.
+ if (!StartingInst)
+ for (const Argument &II : OldFunc->args())
+ assert(VMap.count(&II) && "No mapping from source argument specified!");
+#endif
+
+ PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
+ NameSuffix, CodeInfo);
+ const BasicBlock *StartingBB;
+ if (StartingInst)
+ StartingBB = StartingInst->getParent();
+ else {
+ StartingBB = &OldFunc->getEntryBlock();
+ StartingInst = &StartingBB->front();
+ }
+
+ // Clone the entry block, and anything recursively reachable from it.
+ std::vector<const BasicBlock*> CloneWorklist;
+ PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
+ while (!CloneWorklist.empty()) {
+ const BasicBlock *BB = CloneWorklist.back();
+ CloneWorklist.pop_back();
+ PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
+ }
+
+ // Loop over all of the basic blocks in the old function. If the block was
+ // reachable, we have cloned it and the old block is now in the value map:
+ // insert it into the new function in the right order. If not, ignore it.
+ //
+ // Defer PHI resolution until rest of function is resolved.
+ SmallVector<const PHINode*, 16> PHIToResolve;
+ for (const BasicBlock &BI : *OldFunc) {
+ Value *V = VMap.lookup(&BI);
+ BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
+ if (!NewBB) continue; // Dead block.
+
+ // Add the new block to the new function.
+ NewFunc->getBasicBlockList().push_back(NewBB);
+
+ // Handle PHI nodes specially, as we have to remove references to dead
+ // blocks.
+ for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) {
+ // PHI nodes may have been remapped to non-PHI nodes by the caller or
+ // during the cloning process.
+ if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (isa<PHINode>(VMap[PN]))
+ PHIToResolve.push_back(PN);
+ else
+ break;
+ } else {
+ break;
+ }
+ }
+
+ // Finally, remap the terminator instructions, as those can't be remapped
+ // until all BBs are mapped.
+ RemapInstruction(NewBB->getTerminator(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ }
+
+ // Defer PHI resolution until rest of function is resolved, PHI resolution
+ // requires the CFG to be up-to-date.
+ for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+ const PHINode *OPN = PHIToResolve[phino];
+ unsigned NumPreds = OPN->getNumIncomingValues();
+ const BasicBlock *OldBB = OPN->getParent();
+ BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
+
+ // Map operands for blocks that are live and remove operands for blocks
+ // that are dead.
+ for (; phino != PHIToResolve.size() &&
+ PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+ OPN = PHIToResolve[phino];
+ PHINode *PN = cast<PHINode>(VMap[OPN]);
+ for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+ Value *V = VMap.lookup(PN->getIncomingBlock(pred));
+ if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
+ Value *InVal = MapValue(PN->getIncomingValue(pred),
+ VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+ assert(InVal && "Unknown input value?");
+ PN->setIncomingValue(pred, InVal);
+ PN->setIncomingBlock(pred, MappedBlock);
+ } else {
+ PN->removeIncomingValue(pred, false);
+ --pred; // Revisit the next entry.
+ --e;
+ }
+ }
+ }
+
+ // The loop above has removed PHI entries for those blocks that are dead
+ // and has updated others. However, if a block is live (i.e. copied over)
+ // but its terminator has been changed to not go to this block, then our
+ // phi nodes will have invalid entries. Update the PHI nodes in this
+ // case.
+ PHINode *PN = cast<PHINode>(NewBB->begin());
+ NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+ if (NumPreds != PN->getNumIncomingValues()) {
+ assert(NumPreds < PN->getNumIncomingValues());
+ // Count how many times each predecessor comes to this block.
+ std::map<BasicBlock*, unsigned> PredCount;
+ for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+ PI != E; ++PI)
+ --PredCount[*PI];
+
+ // Figure out how many entries to remove from each PHI.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ ++PredCount[PN->getIncomingBlock(i)];
+
+ // At this point, the excess predecessor entries are positive in the
+ // map. Loop over all of the PHIs and remove excess predecessor
+ // entries.
+ BasicBlock::iterator I = NewBB->begin();
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ for (const auto &PCI : PredCount) {
+ BasicBlock *Pred = PCI.first;
+ for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ PN->removeIncomingValue(Pred, false);
+ }
+ }
+ }
+
+ // If the loops above have made these phi nodes have 0 or 1 operand,
+ // replace them with undef or the input value. We must do this for
+ // correctness, because 0-operand phis are not valid.
+ PN = cast<PHINode>(NewBB->begin());
+ if (PN->getNumIncomingValues() == 0) {
+ BasicBlock::iterator I = NewBB->begin();
+ BasicBlock::const_iterator OldI = OldBB->begin();
+ while ((PN = dyn_cast<PHINode>(I++))) {
+ Value *NV = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NV);
+ assert(VMap[&*OldI] == PN && "VMap mismatch");
+ VMap[&*OldI] = NV;
+ PN->eraseFromParent();
+ ++OldI;
+ }
+ }
+ }
+
+ // Make a second pass over the PHINodes now that all of them have been
+ // remapped into the new function, simplifying the PHINode and performing any
+ // recursive simplifications exposed. This will transparently update the
+ // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
+ // two PHINodes, the iteration over the old PHIs remains valid, and the
+ // mapping will just map us to the new node (which may not even be a PHI
+ // node).
+ const DataLayout &DL = NewFunc->getParent()->getDataLayout();
+ SmallSetVector<const Value *, 8> Worklist;
+ for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+ if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
+ Worklist.insert(PHIToResolve[Idx]);
+
+ // Note that we must test the size on each iteration, the worklist can grow.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ const Value *OrigV = Worklist[Idx];
+ auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
+ if (!I)
+ continue;
+
+ // Skip over non-intrinsic callsites; we don't want to remove any nodes
+ // from the CGSCC.
+ CallSite CS = CallSite(I);
+ if (CS && CS.getCalledFunction() && !CS.getCalledFunction()->isIntrinsic())
+ continue;
+
+ // See if this instruction simplifies.
+ Value *SimpleV = SimplifyInstruction(I, DL);
+ if (!SimpleV)
+ continue;
+
+ // Stash away all the uses of the old instruction so we can check them for
+ // recursive simplifications after a RAUW. This is cheaper than checking all
+ // uses of To on the recursive step in most cases.
+ for (const User *U : OrigV->users())
+ Worklist.insert(cast<Instruction>(U));
+
+ // Replace the instruction with its simplified value.
+ I->replaceAllUsesWith(SimpleV);
+
+ // If the original instruction had no side effects, remove it.
+ if (isInstructionTriviallyDead(I))
+ I->eraseFromParent();
+ else
+ VMap[OrigV] = I;
+ }
+
+ // Now that the inlined function body has been fully constructed, go through
+ // and zap unconditional fall-through branches. This happens all the time when
+ // specializing code: code specialization turns conditional branches into
+ // uncond branches, and this code folds them.
+ Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
+ Function::iterator I = Begin;
+ while (I != NewFunc->end()) {
+ // Check if this block has become dead during inlining or other
+ // simplifications. Note that the first block will appear dead, as it has
+ // not yet been wired up properly.
+ if (I != Begin && (pred_begin(&*I) == pred_end(&*I) ||
+ I->getSinglePredecessor() == &*I)) {
+ BasicBlock *DeadBB = &*I++;
+ DeleteDeadBlock(DeadBB);
+ continue;
+ }
+
+ // We need to simplify conditional branches and switches with a constant
+ // operand. We try to prune these out when cloning, but if the
+ // simplification required looking through PHI nodes, those are only
+ // available after forming the full basic block. That may leave some here,
+ // and we still want to prune the dead code as early as possible.
+ ConstantFoldTerminator(&*I);
+
+ BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+ if (!BI || BI->isConditional()) { ++I; continue; }
+
+ BasicBlock *Dest = BI->getSuccessor(0);
+ if (!Dest->getSinglePredecessor()) {
+ ++I; continue;
+ }
+
+ // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
+ // above should have zapped all of them.
+ assert(!isa<PHINode>(Dest->begin()));
+
+ // We know all single-entry PHI nodes in the inlined function have been
+ // removed, so we just need to splice the blocks.
+ BI->eraseFromParent();
+
+ // Make all PHI nodes that referred to Dest now refer to I as their source.
+ Dest->replaceAllUsesWith(&*I);
+
+ // Move all the instructions in the succ to the pred.
+ I->getInstList().splice(I->end(), Dest->getInstList());
+
+ // Remove the dest block.
+ Dest->eraseFromParent();
+
+ // Do not increment I, iteratively merge all things this block branches to.
+ }
+
+ // Make a final pass over the basic blocks from the old function to gather
+ // any return instructions which survived folding. We have to do this here
+ // because we can iteratively remove and merge returns above.
+ for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
+ E = NewFunc->end();
+ I != E; ++I)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
+ Returns.push_back(RI);
+}
+
+
+/// This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly. The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead. Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo,
+ Instruction *TheCall) {
+ CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
+ ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
+}
+
+/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+void llvm::remapInstructionsInBlocks(
+ const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
+ // Rewrite the code to refer to itself.
+ for (auto *BB : Blocks)
+ for (auto &Inst : *BB)
+ RemapInstruction(&Inst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+}
+
+/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks) {
+ assert(OrigLoop->getSubLoops().empty() &&
+ "Loop to be cloned cannot have inner loop");
+ Function *F = OrigLoop->getHeader()->getParent();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ Loop *NewLoop = new Loop();
+ if (ParentLoop)
+ ParentLoop->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+ assert(OrigPH && "No preheader");
+ BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+ // To rename the loop PHIs.
+ VMap[OrigPH] = NewPH;
+ Blocks.push_back(NewPH);
+
+ // Update LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+ // Update DominatorTree.
+ DT->addNewBlock(NewPH, LoopDomBB);
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+ VMap[BB] = NewBB;
+
+ // Update LoopInfo.
+ NewLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // Add DominatorTree node. After seeing all blocks, update to correct IDom.
+ DT->addNewBlock(NewBB, NewPH);
+
+ Blocks.push_back(NewBB);
+ }
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
+ cast<BasicBlock>(VMap[IDomBB]));
+ }
+
+ // Move them physically from the end of the block list.
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewPH);
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewLoop->getHeader()->getIterator(), F->end());
+
+ return NewLoop;
+}
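+// Illustrative usage sketch (hypothetical caller code): the cloned blocks
+// still reference values from the original loop, so callers typically follow
+// up with the remapping helper defined above:
+//
+//   SmallVector<BasicBlock *, 8> NewBlocks;
+//   Loop *NewLoop = cloneLoopWithPreheader(InsertBefore, DomBB, L, VMap,
+//                                          ".copy", LI, DT, NewBlocks);
+//   remapInstructionsInBlocks(NewBlocks, VMap);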
+
+/// \brief Duplicate non-Phi instructions from the beginning of a block up to
+/// the StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+ Instruction *StopAt,
+ ValueToValueMapTy &ValueMapping) {
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ NewBB->setName(PredBB->getName() + ".split");
+ Instruction *NewTerm = NewBB->getTerminator();
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; StopAt != &*BI; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ New->insertBefore(NewTerm);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ auto I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return NewBB;
+}
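+// Sketch of the effect (hypothetical CFG): given an edge PredBB -> BB where BB
+// begins with PHI nodes, the edge is split into PredBB -> PredBB.split -> BB,
+// copies of BB's non-PHI instructions up to (but not including) StopAt are
+// placed in PredBB.split, and ValueMapping records original -> copy so the
+// caller can rewrite later uses.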
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
new file mode 100644
index 000000000000..d27cb45c7d7f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -0,0 +1,200 @@
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Core.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
+ const Comdat *SC = Src->getComdat();
+ if (!SC)
+ return;
+ Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SC->getSelectionKind());
+ Dst->setComdat(DC);
+}
+
+/// This is not as easy as it might seem because we have to worry about making
+/// copies of global variables and functions, and making their (initializers and
+/// references, respectively) refer to the right globals.
+///
+std::unique_ptr<Module> llvm::CloneModule(const Module *M) {
+ // Create the value map that maps things from the old module over to the new
+ // module.
+ ValueToValueMapTy VMap;
+ return CloneModule(M, VMap);
+}
+
+std::unique_ptr<Module> llvm::CloneModule(const Module *M,
+ ValueToValueMapTy &VMap) {
+ return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
+}
+
+std::unique_ptr<Module> llvm::CloneModule(
+ const Module *M, ValueToValueMapTy &VMap,
+ function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
+ // First off, we need to create the new module.
+ std::unique_ptr<Module> New =
+ llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());
+ New->setDataLayout(M->getDataLayout());
+ New->setTargetTriple(M->getTargetTriple());
+ New->setModuleInlineAsm(M->getModuleInlineAsm());
+
+ // Loop over all of the global variables, making corresponding globals in the
+ // new module. Here we add them to the VMap and to the new Module. We
+ // don't worry about attributes or initializers, they will come later.
+ //
+ for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = new GlobalVariable(*New,
+ I->getValueType(),
+ I->isConstant(), I->getLinkage(),
+ (Constant*) nullptr, I->getName(),
+ (GlobalVariable*) nullptr,
+ I->getThreadLocalMode(),
+ I->getType()->getAddressSpace());
+ GV->copyAttributesFrom(&*I);
+ VMap[&*I] = GV;
+ }
+
+ // Loop over the functions in the module, making external functions as before
+ for (const Function &I : *M) {
+ Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),
+ I.getLinkage(), I.getName(), New.get());
+ NF->copyAttributesFrom(&I);
+ VMap[&I] = NF;
+ }
+
+ // Loop over the aliases in the module
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I) {
+ if (!ShouldCloneDefinition(&*I)) {
+ // An alias cannot act as an external reference, so we need to create
+ // either a function or a global variable depending on the value type.
+ // FIXME: Once pointee types are gone we can probably pick one or the
+ // other.
+ GlobalValue *GV;
+ if (I->getValueType()->isFunctionTy())
+ GV = Function::Create(cast<FunctionType>(I->getValueType()),
+ GlobalValue::ExternalLinkage, I->getName(),
+ New.get());
+ else
+ GV = new GlobalVariable(
+ *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
+ nullptr, I->getName(), nullptr,
+ I->getThreadLocalMode(), I->getType()->getAddressSpace());
+ VMap[&*I] = GV;
+ // We do not copy attributes (mainly because copying between different
+ // kinds of globals is forbidden), but this is generally not required for
+ // correctness.
+ continue;
+ }
+ auto *GA = GlobalAlias::create(I->getValueType(),
+ I->getType()->getPointerAddressSpace(),
+ I->getLinkage(), I->getName(), New.get());
+ GA->copyAttributesFrom(&*I);
+ VMap[&*I] = GA;
+ }
+
+ // Now that all of the things that a global variable initializer can refer to
+ // have been created, loop through and copy the global variable referrers
+ // over... We also set the attributes on the global now.
+ //
+ for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+ I != E; ++I) {
+ if (I->isDeclaration())
+ continue;
+
+ GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]);
+ if (!ShouldCloneDefinition(&*I)) {
+ // Skip after setting the correct linkage for an external reference.
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ continue;
+ }
+ if (I->hasInitializer())
+ GV->setInitializer(MapValue(I->getInitializer(), VMap));
+
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ I->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
+
+ copyComdat(GV, &*I);
+ }
+
+ // Similarly, copy over function bodies now...
+ //
+ for (const Function &I : *M) {
+ if (I.isDeclaration())
+ continue;
+
+ Function *F = cast<Function>(VMap[&I]);
+ if (!ShouldCloneDefinition(&I)) {
+ // Skip after setting the correct linkage for an external reference.
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ // Personality function is not valid on a declaration.
+ F->setPersonalityFn(nullptr);
+ continue;
+ }
+
+ Function::arg_iterator DestI = F->arg_begin();
+ for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
+ ++J) {
+ DestI->setName(J->getName());
+ VMap[&*J] = &*DestI++;
+ }
+
+ SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ if (I.hasPersonalityFn())
+ F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
+
+ copyComdat(F, &I);
+ }
+
+ // And aliases
+ for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+ I != E; ++I) {
+ // We already dealt with undefined aliases above.
+ if (!ShouldCloneDefinition(&*I))
+ continue;
+ GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
+ if (const Constant *C = I->getAliasee())
+ GA->setAliasee(MapValue(C, VMap));
+ }
+
+ // And named metadata....
+ for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
+ E = M->named_metadata_end(); I != E; ++I) {
+ const NamedMDNode &NMD = *I;
+ NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
+ for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
+ NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
+ }
+
+ return New;
+}
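+// Illustrative usage sketch (hypothetical caller code): the predicate overload
+// above lets a caller clone a module while keeping only selected definitions;
+// everything else becomes an external declaration. For example, to keep only
+// the body of "main":
+//
+//   ValueToValueMapTy VMap;
+//   std::unique_ptr<Module> Partial = CloneModule(
+//       M, VMap, [](const GlobalValue *GV) { return GV->getName() == "main"; });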
+
+extern "C" {
+
+LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
+ return wrap(CloneModule(unwrap(M)).release());
+}
+
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
new file mode 100644
index 000000000000..d9294c499309
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -0,0 +1,108 @@
+//===- CmpInstAnalysis.cpp - Utils to help fold compares ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file holds routines to help analyse compare instructions
+// and fold them into constants or other compare instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) {
+ ICmpInst::Predicate Pred = InvertPred ? ICI->getInversePredicate()
+ : ICI->getPredicate();
+ switch (Pred) {
+ // False -> 0
+ case ICmpInst::ICMP_UGT: return 1; // 001
+ case ICmpInst::ICMP_SGT: return 1; // 001
+ case ICmpInst::ICMP_EQ: return 2; // 010
+ case ICmpInst::ICMP_UGE: return 3; // 011
+ case ICmpInst::ICMP_SGE: return 3; // 011
+ case ICmpInst::ICMP_ULT: return 4; // 100
+ case ICmpInst::ICMP_SLT: return 4; // 100
+ case ICmpInst::ICMP_NE: return 5; // 101
+ case ICmpInst::ICMP_ULE: return 6; // 110
+ case ICmpInst::ICMP_SLE: return 6; // 110
+ // True -> 7
+ default:
+ llvm_unreachable("Invalid ICmp predicate!");
+ }
+}
+
+Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
+ CmpInst::Predicate &NewICmpPred) {
+ switch (Code) {
+ default: llvm_unreachable("Illegal ICmp code!");
+ case 0: // False.
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+ case 1: NewICmpPred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+ case 2: NewICmpPred = ICmpInst::ICMP_EQ; break;
+ case 3: NewICmpPred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+ case 4: NewICmpPred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+ case 5: NewICmpPred = ICmpInst::ICMP_NE; break;
+ case 6: NewICmpPred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+ case 7: // True.
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+ }
+ return nullptr;
+}
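+// Worked example (hypothetical operands L and R): the 3-bit codes above allow
+// two compares of the same operands to be combined by bitwise operations on
+// their codes. For instance
+//
+//   getICmpCode(icmp ult L, R) == 4   // 100
+//   getICmpCode(icmp eq  L, R) == 2   // 010
+//
+// and 4 | 2 == 6, so getICmpValue(/*Sign=*/false, 6, L, R, Pred) returns
+// nullptr and sets Pred to ICMP_ULE: (L <u R) | (L == R) folds to L <=u R.
+// Only codes 0 and 7 produce a constant false/true result directly.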
+
+bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
+ return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) ||
+ (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) ||
+ (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1));
+}
+
+bool llvm::decomposeBitTestICmp(const ICmpInst *I, CmpInst::Predicate &Pred,
+ Value *&X, Value *&Y, Value *&Z) {
+ ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!C)
+ return false;
+
+ switch (I->getPredicate()) {
+ default:
+ return false;
+ case ICmpInst::ICMP_SLT:
+ // X < 0 is equivalent to (X & SignMask) != 0.
+ if (!C->isZero())
+ return false;
+ Y = ConstantInt::get(I->getContext(), APInt::getSignMask(C->getBitWidth()));
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_SGT:
+ // X > -1 is equivalent to (X & SignMask) == 0.
+ if (!C->isMinusOne())
+ return false;
+ Y = ConstantInt::get(I->getContext(), APInt::getSignMask(C->getBitWidth()));
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // X <u 2^n is equivalent to (X & ~(2^n-1)) == 0.
+ if (!C->getValue().isPowerOf2())
+ return false;
+ Y = ConstantInt::get(I->getContext(), -C->getValue());
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // X >u 2^n-1 is equivalent to (X & ~(2^n-1)) != 0.
+ if (!(C->getValue() + 1).isPowerOf2())
+ return false;
+ Y = ConstantInt::get(I->getContext(), ~C->getValue());
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ }
+
+ X = I->getOperand(0);
+ Z = ConstantInt::getNullValue(C->getType());
+ return true;
+}
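+// Worked example (hypothetical i32 value %x): for "icmp ult i32 %x, 16" the
+// constant 16 is a power of two, so the routine above succeeds with X = %x,
+// Y = -16 (the mask ~15), Z = 0 and Pred = ICMP_EQ, i.e. the equivalent bit
+// test (%x & -16) == 0.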
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
new file mode 100644
index 000000000000..1189714dfab1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -0,0 +1,1122 @@
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "code-extractor"
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+ cl::desc("Aggregate arguments to code-extracted functions"));
+
+/// \brief Test whether a block is valid for extraction.
+bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) {
+ // Landing pads must be in the function where they were inserted for cleanup.
+ if (BB.isEHPad())
+ return false;
+ // Taking the address of a basic block moved to another function is illegal.
+ if (BB.hasAddressTaken())
+ return false;
+
+ // Don't hoist code that uses another basic block's address, as it's likely
+ // to lead to unexpected behavior, like cross-function jumps.
+ SmallPtrSet<User const *, 16> Visited;
+ SmallVector<User const *, 16> ToVisit;
+
+ for (Instruction const &Inst : BB)
+ ToVisit.push_back(&Inst);
+
+ while (!ToVisit.empty()) {
+ User const *Curr = ToVisit.pop_back_val();
+ if (!Visited.insert(Curr).second)
+ continue;
+ if (isa<BlockAddress const>(Curr))
+ return false; // even a self-reference is likely to be incompatible
+
+ if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
+ continue;
+
+ for (auto const &U : Curr->operands()) {
+ if (auto *UU = dyn_cast<User>(U))
+ ToVisit.push_back(UU);
+ }
+ }
+
+ // Don't hoist code containing allocas, invokes, or vastarts.
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
+ return false;
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (const Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Build a set of blocks to extract if the input blocks are viable.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT) {
+ assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
+ SetVector<BasicBlock *> Result;
+
+ // Loop over the blocks, adding them to our set-vector, and aborting with an
+ // empty set if we encounter invalid blocks.
+ for (BasicBlock *BB : BBs) {
+
+ // If this block is dead, don't process it.
+ if (DT && !DT->isReachableFromEntry(BB))
+ continue;
+
+ if (!Result.insert(BB))
+ llvm_unreachable("Repeated basic blocks in extraction input");
+ if (!CodeExtractor::isBlockValidForExtraction(*BB)) {
+ Result.clear();
+ return Result;
+ }
+ }
+
+#ifndef NDEBUG
+ for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()),
+ E = Result.end();
+ I != E; ++I)
+ for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I);
+ PI != PE; ++PI)
+ assert(Result.count(*PI) &&
+ "No blocks in this region may have entries from outside the region"
+ " except for the first block!");
+#endif
+
+ return Result;
+}
+
+CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AggregateArgs, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(BBs, DT)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
+ BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI)
+ : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks(), &DT)),
+ NumExitBlocks(~0U) {}
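+// Illustrative usage sketch (hypothetical caller code; BlocksToOutline and DT
+// are assumed caller variables): a pass hands a single-entry region to one of
+// the constructors above and then performs the extraction itself:
+//
+//   CodeExtractor CE(BlocksToOutline, &DT);
+//   if (Function *Outlined = CE.extractCodeRegion())
+//     Outlined->setName("outlined.region");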
+
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (isa<Argument>(V)) return true;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (!Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
+ BasicBlock *CommonExitBlock = nullptr;
+ auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
+ for (auto *Succ : successors(Block)) {
+ // Internal edges, ok.
+ if (Blocks.count(Succ))
+ continue;
+ if (!CommonExitBlock) {
+ CommonExitBlock = Succ;
+ continue;
+ }
+ if (CommonExitBlock == Succ)
+ continue;
+
+ return true;
+ }
+ return false;
+ };
+
+ if (any_of(Blocks, hasNonCommonExitSucc))
+ return nullptr;
+
+ return CommonExitBlock;
+}
+
+bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
+ Instruction *Addr) const {
+ AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
+ Function *Func = (*Blocks.begin())->getParent();
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ for (Instruction &II : BB) {
+
+ if (isa<DbgInfoIntrinsic>(II))
+ continue;
+
+ unsigned Opcode = II.getOpcode();
+ Value *MemAddr = nullptr;
+ switch (Opcode) {
+ case Instruction::Store:
+ case Instruction::Load: {
+ if (Opcode == Instruction::Store) {
+ StoreInst *SI = cast<StoreInst>(&II);
+ MemAddr = SI->getPointerOperand();
+ } else {
+ LoadInst *LI = cast<LoadInst>(&II);
+ MemAddr = LI->getPointerOperand();
+ }
+ // A global variable cannot alias a local alloca.
+ if (dyn_cast<Constant>(MemAddr))
+ break;
+ Value *Base = MemAddr->stripInBoundsConstantOffsets();
+ if (!dyn_cast<AllocaInst>(Base) || Base == AI)
+ return false;
+ break;
+ }
+ default: {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ break;
+ return false;
+ }
+ // Treat all the other cases conservatively if it has side effects.
+ if (II.mayHaveSideEffects())
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+BasicBlock *
+CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
+ BasicBlock *SinglePredFromOutlineRegion = nullptr;
+ assert(!Blocks.count(CommonExitBlock) &&
+ "Expect a block outside the region!");
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (!Blocks.count(Pred))
+ continue;
+ if (!SinglePredFromOutlineRegion) {
+ SinglePredFromOutlineRegion = Pred;
+ } else if (SinglePredFromOutlineRegion != Pred) {
+ SinglePredFromOutlineRegion = nullptr;
+ break;
+ }
+ }
+
+ if (SinglePredFromOutlineRegion)
+ return SinglePredFromOutlineRegion;
+
+#ifndef NDEBUG
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+ // If there are any phi nodes, the single pred either exists or has already
+ // been created before code extraction.
+ assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
+#endif
+
+ BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
+ CommonExitBlock->getFirstNonPHI()->getIterator());
+
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (Blocks.count(Pred))
+ continue;
+ Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
+ }
+ // Now add the old exit block to the outline region.
+ Blocks.insert(CommonExitBlock);
+ return CommonExitBlock;
+}
+
+void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands,
+ BasicBlock *&ExitBlock) const {
+ Function *Func = (*Blocks.begin())->getParent();
+ ExitBlock = getCommonExitBlock(Blocks);
+
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ for (Instruction &II : BB) {
+ auto *AI = dyn_cast<AllocaInst>(&II);
+ if (!AI)
+ continue;
+
+ // Find the pair of lifetime markers for address 'Addr' that are either
+ // defined inside the outline region or can legally be shrinkwrapped into
+ // the outline region. If there are no other untracked uses of the
+ // address, return the pair of markers if found; otherwise return a pair
+ // of nullptrs.
+ auto GetLifeTimeMarkers =
+ [&](Instruction *Addr, bool &SinkLifeStart,
+ bool &HoistLifeEnd) -> std::pair<Instruction *, Instruction *> {
+ Instruction *LifeStart = nullptr, *LifeEnd = nullptr;
+
+ for (User *U : Addr->users()) {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
+ // Do not handle the case where AI has multiple start markers.
+ if (LifeStart)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ LifeStart = IntrInst;
+ }
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
+ if (LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ LifeEnd = IntrInst;
+ }
+ continue;
+ }
+ // Found an untracked use of the address; bail.
+ if (!definedInRegion(Blocks, U))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ }
+
+ if (!LifeStart || !LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ SinkLifeStart = !definedInRegion(Blocks, LifeStart);
+ HoistLifeEnd = !definedInRegion(Blocks, LifeEnd);
+ // Do the legality check.
+ if ((SinkLifeStart || HoistLifeEnd) &&
+ !isLegalToShrinkwrapLifetimeMarkers(Addr))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ // Check to see if we have a place to do hoisting, if not, bail.
+ if (HoistLifeEnd && !ExitBlock)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ return std::make_pair(LifeStart, LifeEnd);
+ };
+
+ bool SinkLifeStart = false, HoistLifeEnd = false;
+ auto Markers = GetLifeTimeMarkers(AI, SinkLifeStart, HoistLifeEnd);
+
+ if (Markers.first) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
+ SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
+ continue;
+ }
+
+ // Follow the bitcast.
+ Instruction *MarkerAddr = nullptr;
+ for (User *U : AI->users()) {
+
+ if (U->stripInBoundsConstantOffsets() == AI) {
+ SinkLifeStart = false;
+ HoistLifeEnd = false;
+ Instruction *Bitcast = cast<Instruction>(U);
+ Markers = GetLifeTimeMarkers(Bitcast, SinkLifeStart, HoistLifeEnd);
+ if (Markers.first) {
+ MarkerAddr = Bitcast;
+ continue;
+ }
+ }
+
+ // Found unknown use of AI.
+ if (!definedInRegion(Blocks, U)) {
+ MarkerAddr = nullptr;
+ break;
+ }
+ }
+
+ if (MarkerAddr) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
+ if (!definedInRegion(Blocks, MarkerAddr))
+ SinkCands.insert(MarkerAddr);
+ SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
+ }
+ }
+ }
+}
+
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
+ const ValueSet &SinkCands) const {
+
+ for (BasicBlock *BB : Blocks) {
+ // If a used value is defined outside the region, it's an input. If an
+ // instruction is used outside the region, it's an output.
+ for (Instruction &II : *BB) {
+ for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE;
+ ++OI) {
+ Value *V = *OI;
+ if (!SinkCands.count(V) && definedInCaller(Blocks, V))
+ Inputs.insert(V);
+ }
+
+ for (User *U : II.users())
+ if (!definedInRegion(Blocks, U)) {
+ Outputs.insert(&II);
+ break;
+ }
+ }
+ }
+}
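+// Worked example (hypothetical IR) for the input/output classification above:
+// with Blocks = { %body } in
+//
+//   entry:
+//     %a = add i32 %x, 1
+//     br label %body
+//   body:
+//     %b = mul i32 %a, 2          ; uses %a, which is defined outside the region
+//     br label %exit
+//   exit:
+//     %c = add i32 %b, 3          ; uses %b, which is defined inside the region
+//
+// the routine reports %a as an input and %b as an output (assuming neither is
+// a sink candidate).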
+
+/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
+/// region, we need to split the entry block of the region so that the PHI node
+/// is easier to deal with.
+void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
+ unsigned NumPredsFromRegion = 0;
+ unsigned NumPredsOutsideRegion = 0;
+
+ if (Header != &Header->getParent()->getEntryBlock()) {
+ PHINode *PN = dyn_cast<PHINode>(Header->begin());
+ if (!PN) return; // No PHI nodes.
+
+ // If the header node contains any PHI nodes, check to see if there is more
+ // than one entry from outside the region. If so, we need to sever the
+ // header block into two.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i)))
+ ++NumPredsFromRegion;
+ else
+ ++NumPredsOutsideRegion;
+
+ // If there is one (or fewer) predecessor from outside the region, we don't
+ // need to do anything special.
+ if (NumPredsOutsideRegion <= 1) return;
+ }
+
+ // Otherwise, we need to split the header block into two pieces: one
+ // containing PHI nodes merging values from outside of the region, and a
+ // second that contains all of the code for the block and merges back any
+ // incoming values from inside of the region.
+ BasicBlock *NewBB = llvm::SplitBlock(Header, Header->getFirstNonPHI(), DT);
+
+ // We only want to code extract the second block now, and it becomes the new
+ // header of the region.
+ BasicBlock *OldPred = Header;
+ Blocks.remove(OldPred);
+ Blocks.insert(NewBB);
+ Header = NewBB;
+
+ // Okay, now we need to adjust the PHI nodes and any branches from within the
+ // region to go to the new header block instead of the old header block.
+ if (NumPredsFromRegion) {
+ PHINode *PN = cast<PHINode>(OldPred->begin());
+ // Loop over all of the predecessors of OldPred that are in the region,
+ // changing them to branch to NewBB instead.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+ TI->replaceUsesOfWith(OldPred, NewBB);
+ }
+
+ // Okay, everything within the region is now branching to the right block;
+ // we just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+ BasicBlock::iterator AfterPHIs;
+ for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+ PHINode *PN = cast<PHINode>(AfterPHIs);
+ // Create a new PHI node in the new region, which has an incoming value
+ // from OldPred of PN.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
+ PN->getName() + ".ce", &NewBB->front());
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, OldPred);
+
+ // Loop over all of the incoming values in PN, moving them to NewPN if they
+ // are from the extracted region.
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+ PN->removeIncomingValue(i);
+ --i;
+ }
+ }
+ }
+ }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+ for (BasicBlock *Block : Blocks)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
+ BasicBlock *New =
+ Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
+ if (DT) {
+ // Old dominates New. New node dominates all other nodes dominated
+ // by Old.
+ DomTreeNode *OldNode = DT->getNode(Block);
+ SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
+ OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Block);
+
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+ }
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+///
+Function *CodeExtractor::constructFunction(const ValueSet &inputs,
+ const ValueSet &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode,
+ BasicBlock *newHeader,
+ Function *oldFunction,
+ Module *M) {
+ DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+ DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+
+ // This function returns unsigned; outputs will go back by reference.
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: RetTy = Type::getVoidTy(header->getContext()); break;
+ case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
+ default: RetTy = Type::getInt16Ty(header->getContext()); break;
+ }
+
+ std::vector<Type*> paramTy;
+
+ // Add the types of the input values to the function's argument list
+ for (Value *value : inputs) {
+ DEBUG(dbgs() << "value used in func: " << *value << "\n");
+ paramTy.push_back(value->getType());
+ }
+
+ // Add the types of the output values to the function's argument list.
+ for (Value *output : outputs) {
+ DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+ if (AggregateArgs)
+ paramTy.push_back(output->getType());
+ else
+ paramTy.push_back(PointerType::getUnqual(output->getType()));
+ }
+
+ DEBUG({
+ dbgs() << "Function type: " << *RetTy << " f(";
+ for (Type *i : paramTy)
+ dbgs() << *i << ", ";
+ dbgs() << ")\n";
+ });
+
+ StructType *StructTy;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ StructTy = StructType::get(M->getContext(), paramTy);
+ paramTy.clear();
+ paramTy.push_back(PointerType::getUnqual(StructTy));
+ }
+ FunctionType *funcType =
+ FunctionType::get(RetTy, paramTy, false);
+
+ // Create the new function
+ Function *newFunction = Function::Create(funcType,
+ GlobalValue::InternalLinkage,
+ oldFunction->getName() + "_" +
+ header->getName(), M);
+ // If the old function is no-throw, so is the new one.
+ if (oldFunction->doesNotThrow())
+ newFunction->setDoesNotThrow();
+
+ // Inherit the uwtable attribute if we need to.
+ if (oldFunction->hasUWTable())
+ newFunction->setHasUWTable();
+
+ // Inherit all of the target dependent attributes.
+ // (e.g. If the extracted region contains a call to an x86.sse
+ // instruction we need to make sure that the extracted region has the
+ // "target-features" attribute allowing it to be lowered.
+ // FIXME: This should be changed to check to see if a specific
+ // attribute can not be inherited.
+ AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
+ for (const auto &Attr : AB.td_attrs())
+ newFunction->addFnAttr(Attr.first, Attr.second);
+
+ newFunction->getBasicBlockList().push_back(newRootNode);
+
+ // Create an iterator to name all of the arguments we inserted.
+ Function::arg_iterator AI = newFunction->arg_begin();
+
+ // Rewrite all users of the inputs in the extracted region to use the
+ // arguments (or appropriate addressing into struct) instead.
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *RewriteVal;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+ TerminatorInst *TI = newFunction->begin()->getTerminator();
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
+ RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
+ } else
+ RewriteVal = &*AI++;
+
+ std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end());
+ for (User *use : Users)
+ if (Instruction *inst = dyn_cast<Instruction>(use))
+ if (Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(inputs[i], RewriteVal);
+ }
+
+ // Set names for input and output arguments.
+ if (!AggregateArgs) {
+ AI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+ AI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+ AI->setName(outputs[i]->getName()+".out");
+ }
+
+ // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+ // within the new function. This must be done before we lose track of which
+ // blocks were originally in the code region.
+ std::vector<User*> Users(header->user_begin(), header->user_end());
+ for (unsigned i = 0, e = Users.size(); i != e; ++i)
+    // If the BasicBlock which contains the branch is not in the region,
+    // modify the branch target to point at the new header block.
+ if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
+ if (!Blocks.count(TI->getParent()) &&
+ TI->getParent()->getParent() == oldFunction)
+ TI->replaceUsesOfWith(header, newHeader);
+
+ return newFunction;
+}
+
+/// FindPhiPredForUseInBlock - Given a value and a basic block, find a PHI
+/// that uses the value within the basic block, and return the predecessor
+/// block associated with that use, or return 0 if none is found.
+static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) {
+ for (Use &U : Used->uses()) {
+ PHINode *P = dyn_cast<PHINode>(U.getUser());
+ if (P && P->getParent() == BB)
+ return P->getIncomingBlock(U);
+ }
+
+ return nullptr;
+}
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
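+///
+/// As an informal sketch (names and exact types are illustrative, not taken
+/// from this change), for a region with two exit blocks the caller side ends
+/// up looking roughly like:
+///
+///   codeRepl:
+///     %targetBlock = call i1 @outlined(...)
+///     br i1 %targetBlock, label %exit0, label %exit1
+///
+/// With more than two exits the outlined function returns an i16 and a
+/// switch on that value dispatches to the exit stubs instead.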
+void CodeExtractor::
+emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
+ ValueSet &inputs, ValueSet &outputs) {
+  // Emit a call to the new function, passing in: a pointer to the struct (if
+  // aggregating parameters), or the plain inputs and allocated memory for the
+  // outputs.
+ std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
+
+ Module *M = newFunction->getParent();
+ LLVMContext &Context = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
+
+ // Add inputs as params, or to be filled into the struct
+ for (Value *input : inputs)
+ if (AggregateArgs)
+ StructValues.push_back(input);
+ else
+ params.push_back(input);
+
+ // Create allocas for the outputs
+ for (Value *output : outputs) {
+ if (AggregateArgs) {
+ StructValues.push_back(output);
+ } else {
+ AllocaInst *alloca =
+ new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+ nullptr, output->getName() + ".loc",
+ &codeReplacer->getParent()->front().front());
+ ReloadOutputs.push_back(alloca);
+ params.push_back(alloca);
+ }
+ }
+
+ StructType *StructArgTy = nullptr;
+ AllocaInst *Struct = nullptr;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ std::vector<Type*> ArgTypes;
+ for (ValueSet::iterator v = StructValues.begin(),
+ ve = StructValues.end(); v != ve; ++v)
+ ArgTypes.push_back((*v)->getType());
+
+ // Allocate a struct at the beginning of this function
+ StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+ Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+ "structArg",
+ &codeReplacer->getParent()->front().front());
+ params.push_back(Struct);
+
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ StoreInst *SI = new StoreInst(StructValues[i], GEP);
+ codeReplacer->getInstList().push_back(SI);
+ }
+ }
+
+ // Emit the call to the function
+ CallInst *call = CallInst::Create(newFunction, params,
+ NumExitBlocks > 1 ? "targetBlock" : "");
+ codeReplacer->getInstList().push_back(call);
+
+ Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+ unsigned FirstOut = inputs.size();
+ if (!AggregateArgs)
+ std::advance(OutputArgBegin, inputs.size());
+
+ // Reload the outputs passed in by reference
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ Value *Output = nullptr;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ Output = GEP;
+ } else {
+ Output = ReloadOutputs[i];
+ }
+ LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload");
+ Reloads.push_back(load);
+ codeReplacer->getInstList().push_back(load);
+ std::vector<User*> Users(outputs[i]->user_begin(), outputs[i]->user_end());
+ for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+ Instruction *inst = cast<Instruction>(Users[u]);
+ if (!Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(outputs[i], load);
+ }
+ }
+
+ // Now we can emit a switch statement using the call as a value.
+ SwitchInst *TheSwitch =
+ SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
+ codeReplacer, 0, codeReplacer);
+
+  // Since there may be multiple exits from the original region, make the new
+  // function return an unsigned value and switch on that number. This loop
+  // iterates over all of the blocks in the extracted region, updating any
+  // terminator instructions that branch to blocks outside of the region.
+ std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
+
+ unsigned switchVal = 0;
+ for (BasicBlock *Block : Blocks) {
+ TerminatorInst *TI = Block->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!Blocks.count(TI->getSuccessor(i))) {
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ // add a new basic block which returns the appropriate value
+ BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+ if (!NewTarget) {
+ // If we don't already have an exit stub for this non-extracted
+ // destination, create one now!
+ NewTarget = BasicBlock::Create(Context,
+ OldTarget->getName() + ".exitStub",
+ newFunction);
+ unsigned SuccNum = switchVal++;
+
+ Value *brVal = nullptr;
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: break; // No value needed.
+ case 2: // Conditional branch, return a bool
+ brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
+ break;
+ default:
+ brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
+ break;
+ }
+
+ ReturnInst *NTRet = ReturnInst::Create(Context, brVal, NewTarget);
+
+ // Update the switch instruction.
+ TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
+ SuccNum),
+ OldTarget);
+
+ // Restore values just before we exit
+ Function::arg_iterator OAI = OutputArgBegin;
+ for (unsigned out = 0, e = outputs.size(); out != e; ++out) {
+ // For an invoke, the normal destination is the only one that is
+ // dominated by the result of the invocation
+ BasicBlock *DefBlock = cast<Instruction>(outputs[out])->getParent();
+
+ bool DominatesDef = true;
+
+ BasicBlock *NormalDest = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(outputs[out]))
+ NormalDest = Invoke->getNormalDest();
+
+ if (NormalDest) {
+ DefBlock = NormalDest;
+
+ // Make sure we are looking at the original successor block, not
+ // at a newly inserted exit block, which won't be in the dominator
+ // info.
+ for (const auto &I : ExitBlockMap)
+ if (DefBlock == I.second) {
+ DefBlock = I.first;
+ break;
+ }
+
+ // In the extract block case, if the block we are extracting ends
+ // with an invoke instruction, make sure that we don't emit a
+ // store of the invoke value for the unwind block.
+ if (!DT && DefBlock != OldTarget)
+ DominatesDef = false;
+ }
+
+ if (DT) {
+ DominatesDef = DT->dominates(DefBlock, OldTarget);
+
+ // If the output value is used by a phi in the target block,
+ // then we need to test for dominance of the phi's predecessor
+              // instead. Unfortunately, this is a little complicated since we
+ // have already rewritten uses of the value to uses of the reload.
+ BasicBlock* pred = FindPhiPredForUseInBlock(Reloads[out],
+ OldTarget);
+ if (pred && DT && DT->dominates(DefBlock, pred))
+ DominatesDef = true;
+ }
+
+ if (DominatesDef) {
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context),
+ FirstOut+out);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, &*OAI, Idx, "gep_" + outputs[out]->getName(),
+ NTRet);
+ new StoreInst(outputs[out], GEP, NTRet);
+ } else {
+ new StoreInst(outputs[out], &*OAI, NTRet);
+ }
+ }
+ // Advance output iterator even if we don't emit a store
+ if (!AggregateArgs) ++OAI;
+ }
+ }
+
+ // rewrite the original branch instruction with this new target
+ TI->setSuccessor(i, NewTarget);
+ }
+ }
+
+ // Now that we've done the deed, simplify the switch instruction.
+ Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+ switch (NumExitBlocks) {
+ case 0:
+    // There are no successors (of the block containing the switch itself),
+    // which means that previously this was the last part of the function,
+    // and hence it should be rewritten as a `ret'.
+
+ // Check if the function should return a value
+ if (OldFnRetTy->isVoidTy()) {
+ ReturnInst::Create(Context, nullptr, TheSwitch); // Return void
+ } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+ // return what we have
+ ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
+ } else {
+      // Otherwise we must have extracted code that ends in an unwind or
+      // something; just return whatever we want.
+ ReturnInst::Create(Context,
+ Constant::getNullValue(OldFnRetTy), TheSwitch);
+ }
+
+ TheSwitch->eraseFromParent();
+ break;
+ case 1:
+ // Only a single destination, change the switch into an unconditional
+ // branch.
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ case 2:
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+ call, TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ default:
+ // Otherwise, make the default destination of the switch instruction be one
+ // of the other successors.
+ TheSwitch->setCondition(call);
+ TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
+ // Remove redundant case
+ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
+ break;
+ }
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+ Function *oldFunc = (*Blocks.begin())->getParent();
+ Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+ Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+ for (BasicBlock *Block : Blocks) {
+ // Delete the basic block from the old function, and the list of blocks
+ oldBlocks.remove(Block);
+
+ // Insert this basic block into the new function
+ newBlocks.push_back(Block);
+ }
+}
+
+void CodeExtractor::calculateNewCallTerminatorWeights(
+ BasicBlock *CodeReplacer,
+ DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
+ BranchProbabilityInfo *BPI) {
+ typedef BlockFrequencyInfoImplBase::Distribution Distribution;
+ typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
+
+ // Update the branch weights for the exit block.
+ TerminatorInst *TI = CodeReplacer->getTerminator();
+ SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
+
+ // Block Frequency distribution with dummy node.
+ Distribution BranchDist;
+
+ // Add each of the frequencies of the successors.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
+ BlockNode ExitNode(i);
+ uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
+ if (ExitFreq != 0)
+ BranchDist.addExit(ExitNode, ExitFreq);
+ else
+ BPI->setEdgeProbability(CodeReplacer, i, BranchProbability::getZero());
+ }
+
+ // Check for no total weight.
+ if (BranchDist.Total == 0)
+ return;
+
+ // Normalize the distribution so that they can fit in unsigned.
+ BranchDist.normalize();
+
+ // Create normalized branch weights and set the metadata.
+ for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
+ const auto &Weight = BranchDist.Weights[I];
+
+ // Get the weight and update the current BFI.
+ BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
+ BranchProbability BP(Weight.Amount, BranchDist.Total);
+ BPI->setEdgeProbability(CodeReplacer, Weight.TargetNode.Index, BP);
+ }
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
+}
+
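+// Informal usage sketch (constructor parameters abbreviated; not part of this
+// change): a client typically builds a CodeExtractor over a set of blocks and
+// asks for the outlined function, e.g.
+//
+//   CodeExtractor CE(Blocks, /*DT=*/&DT);
+//   if (Function *Outlined = CE.extractCodeRegion())
+//     ; // Blocks now live in Outlined; the region was replaced by a call.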
+Function *CodeExtractor::extractCodeRegion() {
+ if (!isEligible())
+ return nullptr;
+
+ ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ BasicBlock *CommonExit = nullptr;
+
+ // Assumption: this is a single-entry code region, and the header is the first
+ // block in the region.
+ BasicBlock *header = *Blocks.begin();
+
+ // Calculate the entry frequency of the new function before we change the root
+ // block.
+ BlockFrequency EntryFreq;
+ if (BFI) {
+ assert(BPI && "Both BPI and BFI are required to preserve profile info");
+ for (BasicBlock *Pred : predecessors(header)) {
+ if (Blocks.count(Pred))
+ continue;
+ EntryFreq +=
+ BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
+ }
+ }
+
+ // If we have to split PHI nodes or the entry block, do so now.
+ severSplitPHINodes(header);
+
+ // If we have any return instructions in the region, split those blocks so
+ // that the return is not in the region.
+ splitReturnBlocks();
+
+ Function *oldFunction = header->getParent();
+
+  // This takes the place of the original code region.
+ BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
+ "codeRepl", oldFunction,
+ header);
+
+ // The new function needs a root node because other nodes can branch to the
+ // head of the region, but the entry node of a function cannot have preds.
+ BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(),
+ "newFuncRoot");
+ newFuncRoot->getInstList().push_back(BranchInst::Create(header));
+
+ findAllocas(SinkingCands, HoistingCands, CommonExit);
+ assert(HoistingCands.empty() || CommonExit);
+
+ // Find inputs to, outputs from the code region.
+ findInputsOutputs(inputs, outputs, SinkingCands);
+
+ // Now sink all instructions which only have non-phi uses inside the region
+ for (auto *II : SinkingCands)
+ cast<Instruction>(II)->moveBefore(*newFuncRoot,
+ newFuncRoot->getFirstInsertionPt());
+
+ if (!HoistingCands.empty()) {
+ auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
+ Instruction *TI = HoistToBlock->getTerminator();
+ for (auto *II : HoistingCands)
+ cast<Instruction>(II)->moveBefore(TI);
+ }
+
+ // Calculate the exit blocks for the extracted region and the total exit
+ // weights for each of those blocks.
+ DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
+ SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+ for (BasicBlock *Block : Blocks) {
+ for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+ ++SI) {
+ if (!Blocks.count(*SI)) {
+ // Update the branch weight for this successor.
+ if (BFI) {
+ BlockFrequency &BF = ExitWeights[*SI];
+ BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
+ }
+ ExitBlocks.insert(*SI);
+ }
+ }
+ }
+ NumExitBlocks = ExitBlocks.size();
+
+ // Construct new function based on inputs/outputs & add allocas for all defs.
+ Function *newFunction = constructFunction(inputs, outputs, header,
+ newFuncRoot,
+ codeReplacer, oldFunction,
+ oldFunction->getParent());
+
+ // Update the entry count of the function.
+ if (BFI) {
+ Optional<uint64_t> EntryCount =
+ BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (EntryCount.hasValue())
+ newFunction->setEntryCount(EntryCount.getValue());
+ BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
+ }
+
+ emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+ moveCodeToFunction(newFunction);
+
+ // Update the branch weights for the exit block.
+ if (BFI && NumExitBlocks > 1)
+ calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
+
+ // Loop over all of the PHI nodes in the header block, and change any
+ // references to the old incoming edge to be the new incoming edge.
+ for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!Blocks.count(PN->getIncomingBlock(i)))
+ PN->setIncomingBlock(i, newFuncRoot);
+ }
+
+ // Look at all successors of the codeReplacer block. If any of these blocks
+ // had PHI nodes in them, we need to update the "from" block to be the code
+ // replacer, not the original block in the extracted region.
+ std::vector<BasicBlock*> Succs(succ_begin(codeReplacer),
+ succ_end(codeReplacer));
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ std::set<BasicBlock*> ProcessedPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
+ PN->setIncomingBlock(i, codeReplacer);
+ else {
+ // There were multiple entries in the PHI for this block, now there
+ // is only one, so remove the duplicated entries.
+ PN->removeIncomingValue(i, false);
+ --i; --e;
+ }
+ }
+ }
+
+ DEBUG(if (verifyFunction(*newFunction))
+ report_fatal_error("verifyFunction failed!"));
+ return newFunction;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
new file mode 100644
index 000000000000..6642a97a29c2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp
@@ -0,0 +1,165 @@
+//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that are used to process llvm.global_ctors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "ctor_utils"
+
+namespace llvm {
+
+namespace {
+/// Given a specified llvm.global_ctors list, remove the listed elements.
+void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
+ // Filter out the initializer elements to remove.
+ ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
+ SmallVector<Constant *, 10> CAList;
+ for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
+ if (!CtorsToRemove.test(I))
+ CAList.push_back(OldCA->getOperand(I));
+
+ // Create the new array initializer.
+ ArrayType *ATy =
+ ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
+ Constant *CA = ConstantArray::get(ATy, CAList);
+
+ // If we didn't change the number of elements, don't create a new GV.
+ if (CA->getType() == OldCA->getType()) {
+ GCL->setInitializer(CA);
+ return;
+ }
+
+ // Create the new global and insert it next to the existing list.
+ GlobalVariable *NGV =
+ new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
+ CA, "", GCL->getThreadLocalMode());
+ GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);
+ NGV->takeName(GCL);
+
+ // Nuke the old list, replacing any uses with the new one.
+ if (!GCL->use_empty()) {
+ Constant *V = NGV;
+ if (V->getType() != GCL->getType())
+ V = ConstantExpr::getBitCast(V, GCL->getType());
+ GCL->replaceAllUsesWith(V);
+ }
+ GCL->eraseFromParent();
+}
+
+/// Given a llvm.global_ctors list that we can understand,
+/// return the list of constructor functions (with a null entry for any null
+/// terminator) as a vector.
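+///
+/// For reference, an entry of that list in textual IR looks roughly like
+/// (layout for this LLVM version; the constructor name is illustrative):
+///
+///   @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }]
+///       [{ i32, void ()*, i8* } { i32 65535, void ()* @ctor, i8* null }]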
+std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
+ if (GV->getInitializer()->isNullValue())
+ return std::vector<Function *>();
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+ std::vector<Function *> Result;
+ Result.reserve(CA->getNumOperands());
+ for (auto &V : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+ }
+ return Result;
+}
+
+/// Find the llvm.global_ctors list, verifying that all initializers have an
+/// init priority of 65535.
+GlobalVariable *findGlobalCtors(Module &M) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return nullptr;
+
+ // Verify that the initializer is simple enough for us to handle. We are
+ // only allowed to optimize the initializer if it is unique.
+ if (!GV->hasUniqueInitializer())
+ return nullptr;
+
+ if (isa<ConstantAggregateZero>(GV->getInitializer()))
+ return GV;
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+
+ for (auto &V : CA->operands()) {
+ if (isa<ConstantAggregateZero>(V))
+ continue;
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ if (isa<ConstantPointerNull>(CS->getOperand(1)))
+ continue;
+
+ // Must have a function or null ptr.
+ if (!isa<Function>(CS->getOperand(1)))
+ return nullptr;
+
+ // Init priority must be standard.
+ ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
+ if (CI->getZExtValue() != 65535)
+ return nullptr;
+ }
+
+ return GV;
+}
+} // namespace
+
+/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
+/// entries for which it returns true. Return true if anything changed.
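+///
+/// A minimal usage sketch (the predicate and the set it consults are
+/// hypothetical, not part of this change):
+///
+///   optimizeGlobalCtorsList(M, [&](Function *F) {
+///     return KnownNoOpCtors.count(F) != 0;
+///   });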
+bool optimizeGlobalCtorsList(Module &M,
+ function_ref<bool(Function *)> ShouldRemove) {
+ GlobalVariable *GlobalCtors = findGlobalCtors(M);
+ if (!GlobalCtors)
+ return false;
+
+ std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors);
+ if (Ctors.empty())
+ return false;
+
+ bool MadeChange = false;
+
+ // Loop over global ctors, optimizing them when we can.
+ unsigned NumCtors = Ctors.size();
+ BitVector CtorsToRemove(NumCtors);
+ for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
+ Function *F = Ctors[i];
+    // Found a null terminator in the middle of the list; just skip over it.
+ if (!F)
+ continue;
+
+ DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+
+ // We cannot simplify external ctor functions.
+ if (F->empty())
+ continue;
+
+ // If we can evaluate the ctor at compile time, do.
+ if (ShouldRemove(F)) {
+ Ctors[i] = nullptr;
+ CtorsToRemove.set(i);
+ NumCtors--;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ removeGlobalCtors(GlobalCtors, CtorsToRemove);
+ return true;
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
new file mode 100644
index 000000000000..6d3d287defdb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -0,0 +1,151 @@
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca. This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value. It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
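+///
+/// A minimal illustrative call (assumed context, not from this change: I is
+/// an instruction whose live range crosses a CFG edit the caller is about to
+/// make):
+///
+///   if (AllocaInst *Slot = DemoteRegToStack(I))
+///     (void)Slot; // uses of I now go through loads/stores of this slot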
+AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+ Instruction *AllocaPoint) {
+ if (I.use_empty()) {
+ I.eraseFromParent();
+ return nullptr;
+ }
+
+ Function *F = I.getParent()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName()+".reg2mem", AllocaPoint);
+ } else {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".reg2mem", &F->getEntryBlock().front());
+ }
+
+ // We cannot demote invoke instructions to the stack if their normal edge
+ // is critical. Therefore, split the critical edge and create a basic block
+ // into which the store can be inserted.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ if (!II->getNormalDest()->getSinglePredecessor()) {
+ unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
+ assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
+ BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
+ assert(BB && "Unable to split critical edge.");
+ (void)BB;
+ }
+ }
+
+ // Change all of the users of the instruction to read from the stack slot.
+ while (!I.use_empty()) {
+ Instruction *U = cast<Instruction>(I.user_back());
+ if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If this is a PHI node, we can't insert a load of the value before the
+ // use. Instead insert the load in the predecessor block corresponding
+ // to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this PHI
+      // node, we cannot have multiple loads. The problem is that the
+ // resulting PHI node will have multiple values (from each load) coming in
+ // from the same block, which is illegal SSA form. For this reason, we
+ // keep track of and reuse loads we insert.
+ DenseMap<BasicBlock*, Value*> Loads;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I) {
+ Value *&V = Loads[PN->getIncomingBlock(i)];
+ if (!V) {
+ // Insert the load into the predecessor block
+ V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads,
+ PN->getIncomingBlock(i)->getTerminator());
+ }
+ PN->setIncomingValue(i, V);
+ }
+
+ } else {
+ // If this is a normal instruction, just insert a load.
+ Value *V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, U);
+ U->replaceUsesOfWith(&I, V);
+ }
+ }
+
+ // Insert stores of the computed value into the stack slot. We have to be
+ // careful if I is an invoke instruction, because we can't insert the store
+ // AFTER the terminator instruction.
+ BasicBlock::iterator InsertPt;
+ if (!isa<TerminatorInst>(I)) {
+ InsertPt = ++I.getIterator();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+ } else {
+ InvokeInst &II = cast<InvokeInst>(I);
+ InsertPt = II.getNormalDest()->getFirstInsertionPt();
+ }
+
+ new StoreInst(&I, Slot, &*InsertPt);
+ return Slot;
+}
+
+/// DemotePHIToStack - This function takes a virtual register computed by a PHI
+/// node and replaces it with a slot in the stack frame allocated via alloca.
+/// The PHI node is deleted. It returns the pointer to the alloca inserted.
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+ if (P->use_empty()) {
+ P->eraseFromParent();
+ return nullptr;
+ }
+
+ const DataLayout &DL = P->getModule()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName()+".reg2mem", AllocaPoint);
+ } else {
+ Function *F = P->getParent()->getParent();
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName() + ".reg2mem",
+ &F->getEntryBlock().front());
+ }
+
+ // Iterate over each operand inserting a store in each predecessor.
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+ assert(II->getParent() != P->getIncomingBlock(i) &&
+ "Invoke edge not supported yet"); (void)II;
+ }
+ new StoreInst(P->getIncomingValue(i), Slot,
+ P->getIncomingBlock(i)->getTerminator());
+ }
+
+ // Insert a load in place of the PHI and replace all uses.
+ BasicBlock::iterator InsertPt = P->getIterator();
+
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+
+ Value *V = new LoadInst(Slot, P->getName() + ".reload", &*InsertPt);
+ P->replaceAllUsesWith(V);
+
+ // Delete PHI.
+ P->eraseFromParent();
+ return Slot;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
new file mode 100644
index 000000000000..78d7474e5b95
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -0,0 +1,95 @@
+//===- EscapeEnumerator.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a helper class that enumerates all possible exits from a function,
+// including exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+static Constant *getDefaultPersonalityFn(Module *M) {
+ LLVMContext &C = M->getContext();
+ Triple T(M->getTargetTriple());
+ EHPersonality Pers = getDefaultEHPersonality(T);
+ return M->getOrInsertFunction(getEHPersonalityName(Pers),
+ FunctionType::get(Type::getInt32Ty(C), true));
+}
+
+IRBuilder<> *EscapeEnumerator::Next() {
+ if (Done)
+ return nullptr;
+
+  // Find all 'return' and 'resume' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = &*StateBB++;
+
+ // Branches and invokes do not escape, only unwind, resume, and return
+ // do.
+ TerminatorInst *TI = CurBB->getTerminator();
+ if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+ continue;
+
+ Builder.SetInsertPoint(TI);
+ return &Builder;
+ }
+
+ Done = true;
+
+ if (!HandleExceptions)
+ return nullptr;
+
+ if (F.doesNotThrow())
+ return nullptr;
+
+ // Find all 'call' instructions that may throw.
+ SmallVector<Instruction *, 16> Calls;
+ for (BasicBlock &BB : F)
+ for (Instruction &II : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&II))
+ if (!CI->doesNotThrow())
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return nullptr;
+
+ // Create a cleanup block.
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
+ if (!F.hasPersonalityFn()) {
+ Constant *PersFn = getDefaultPersonalityFn(F.getParent());
+ F.setPersonalityFn(PersFn);
+ }
+
+ if (isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Funclet EH not supported");
+ }
+
+ LandingPadInst *LPad =
+ LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = Calls.size(); I != 0;) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+ changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
+ }
+
+ Builder.SetInsertPoint(RI);
+ return &Builder;
+}
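+
+// Informal usage sketch (ExitHook is a hypothetical callee; not part of this
+// change): instrument every exit from F, including unwinding exits:
+//
+//   EscapeEnumerator EE(F, "exit_cleanup");
+//   while (IRBuilder<> *B = EE.Next())
+//     B->CreateCall(ExitHook);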
diff --git a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
new file mode 100644
index 000000000000..1328f2f3ec01
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -0,0 +1,597 @@
+//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Function evaluator for LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "evaluator"
+
+using namespace llvm;
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL);
+
+/// Return true if the specified constant can be handled by the code generator.
+/// We don't want to generate something like:
+/// void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
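+///
+/// By contrast, simple shapes such as &global plus a constant GEP offset, or
+/// an int<->ptr cast of pointer width, are accepted, e.g. (illustrative):
+///   i8* getelementptr (i8, i8* @g, i64 8)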
+static bool
+isSimpleEnoughValueToCommitHelper(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // Simple global addresses are supported, do not allow dllimport or
+ // thread-local globals.
+ if (auto *GV = dyn_cast<GlobalValue>(C))
+ return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
+
+ // Simple integer, undef, constant aggregate zero, etc are all supported.
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
+ return true;
+
+ // Aggregate values are safe if all their elements are.
+ if (isa<ConstantAggregate>(C)) {
+ for (Value *Op : C->operands())
+ if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
+ return false;
+ return true;
+ }
+
+ // We don't know exactly what relocations are allowed in constant expressions,
+ // so we allow &global+constantoffset, which is safe and uniformly supported
+ // across targets.
+ ConstantExpr *CE = cast<ConstantExpr>(C);
+ switch (CE->getOpcode()) {
+ case Instruction::BitCast:
+ // Bitcast is fine if the casted value is fine.
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ // int <=> ptr is fine if the int type is the same size as the
+ // pointer type.
+ if (DL.getTypeSizeInBits(CE->getType()) !=
+ DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ // GEP is fine if it is simple + constant offset.
+ case Instruction::GetElementPtr:
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantInt>(CE->getOperand(i)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::Add:
+ // We allow simple+cst.
+ if (!isa<ConstantInt>(CE->getOperand(1)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+ }
+ return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // If we already checked this constant, we win.
+ if (!SimpleConstants.insert(C).second)
+ return true;
+ // Check the constant.
+ return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
+}
+
+/// Return true if this constant is simple enough for us to understand. In
+/// particular, if it is a cast to anything other than from one pointer type to
+/// another pointer type, we punt. We basically just support direct accesses to
+/// globals and GEP's of globals. This should be kept up to date with
+/// CommitValueTo.
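+///
+/// For example (illustrative), a store destination such as
+///   getelementptr inbounds ([4 x i32], [4 x i32]* @arr, i32 0, i32 2)
+/// is acceptable when @arr has a unique initializer, whereas a pointer into
+/// an external or weak global is not.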
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+ // Conservatively, avoid aggregate types. This is because we don't
+ // want to worry about them partially overlapping other stores.
+ if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+ return false;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ // Do not allow weak/*_odr/linkonce linkage or external globals.
+ return GV->hasUniqueInitializer();
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ // Handle a constantexpr gep.
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0)) &&
+ cast<GEPOperator>(CE)->isInBounds()) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ if (!GV->hasUniqueInitializer())
+ return false;
+
+ // The first index must be zero.
+ ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
+ if (!CI || !CI->isZero()) return false;
+
+ // The remaining indices must be compile-time known integers within the
+ // notional bounds of the corresponding static array types.
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+ // A constantexpr bitcast from a pointer to another pointer is a no-op,
+ // and we know how to evaluate it by moving the bitcast from the pointer
+ // operand to the value operand.
+ } else if (CE->getOpcode() == Instruction::BitCast &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+ }
+ }
+
+ return false;
+}
+
+/// Return the value that would be computed by a load from P after the stores
+/// reflected by 'memory' have been performed. If we can't decide, return null.
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
+ // If this memory location has been recently stored, use the stored value: it
+ // is the most up-to-date.
+ DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P);
+ if (I != MutatedMemory.end()) return I->second;
+
+ // Access it.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return nullptr;
+ }
+
+ // Handle a constantexpr getelementptr.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ if (GV->hasDefinitiveInitializer())
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+ }
+
+ return nullptr; // don't know how to evaluate.
+}
+
+/// Evaluate all instructions in block BB, returning true if successful, false
+/// if we can't evaluate it. NextBB returns the next BB that control flows
+/// into, or null if the block ends with a return instruction.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+ BasicBlock *&NextBB) {
+ // This is the main evaluation loop.
+ while (1) {
+ Constant *InstResult = nullptr;
+
+ DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+ if (!SI->isSimple()) {
+ DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+ Constant *Ptr = getVal(SI->getOperand(1));
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
+ DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ Ptr = FoldedPtr;
+ DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ }
+ if (!isSimpleEnoughPointerToCommit(Ptr)) {
+ // If this is too complex for us to commit, reject it.
+ DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
+ return false;
+ }
+
+ Constant *Val = getVal(SI->getOperand(0));
+
+ // If this might be too difficult for the backend to handle (e.g. the addr
+ // of one global variable divided by another) then we can't commit it.
+ if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
+ DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
+ << "\n");
+ return false;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
+ // If we're evaluating a store through a bitcast, then we need
+ // to pull the bitcast off the pointer type and push it onto the
+ // stored value.
+ Ptr = CE->getOperand(0);
+
+ Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
+
+ // In order to push the bitcast onto the stored value, a bitcast
+ // from NewTy to Val's type must be legal. If it's not, we can try
+ // introspecting NewTy to find a legal conversion.
+ while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+ // If NewTy is a struct, we can convert the pointer to the struct
+ // into a pointer to its first member.
+ // FIXME: This could be extended to support arrays as well.
+ if (StructType *STy = dyn_cast<StructType>(NewTy)) {
+ NewTy = STy->getTypeAtIndex(0U);
+
+ IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32);
+ Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+ Constant * const IdxList[] = {IdxZero, IdxZero};
+
+ Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList);
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI))
+ Ptr = FoldedPtr;
+
+ // If we can't improve the situation by introspecting NewTy,
+ // we have to give up.
+ } else {
+ DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
+ return false;
+ }
+ }
+
+ // If we found compatible types, go ahead and push the bitcast
+ // onto the stored value.
+ Val = ConstantExpr::getBitCast(Val, NewTy);
+
+ DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ }
+ }
+
+ MutatedMemory[Ptr] = Val;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+ InstResult = ConstantExpr::get(BO->getOpcode(),
+ getVal(BO->getOperand(0)),
+ getVal(BO->getOperand(1)));
+ DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+ InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+ getVal(CI->getOperand(0)),
+ getVal(CI->getOperand(1)));
+ DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+ InstResult = ConstantExpr::getCast(CI->getOpcode(),
+ getVal(CI->getOperand(0)),
+ CI->getType());
+ DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+ InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+ getVal(SI->getOperand(1)),
+ getVal(SI->getOperand(2)));
+ DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getExtractValue(
+ getVal(EVI->getAggregateOperand()), EVI->getIndices());
+ DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getInsertValue(
+ getVal(IVI->getAggregateOperand()),
+ getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
+ DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+ Constant *P = getVal(GEP->getOperand(0));
+ SmallVector<Constant*, 8> GEPOps;
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+ i != e; ++i)
+ GEPOps.push_back(getVal(*i));
+ InstResult =
+ ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
+ cast<GEPOperator>(GEP)->isInBounds());
+ DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
+ << "\n");
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+
+ if (!LI->isSimple()) {
+ DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+
+ Constant *Ptr = getVal(LI->getOperand(0));
+ if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
+ Ptr = FoldedPtr;
+ DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: " << *Ptr << "\n");
+ }
+ InstResult = ComputeLoadResult(Ptr);
+ if (!InstResult) {
+ DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
+ return false; // Could not evaluate load.
+ }
+
+ DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+ if (AI->isArrayAllocation()) {
+ DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ return false; // Cannot handle array allocs.
+ }
+ Type *Ty = AI->getAllocatedType();
+ AllocaTmps.push_back(
+ make_unique<GlobalVariable>(Ty, false, GlobalValue::InternalLinkage,
+ UndefValue::get(Ty), AI->getName()));
+ InstResult = AllocaTmps.back().get();
+ DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+ CallSite CS(&*CurInst);
+
+ // Debug info can safely be ignored here.
+ if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+ DEBUG(dbgs() << "Ignoring debug info.\n");
+ ++CurInst;
+ continue;
+ }
+
+ // Cannot handle inline asm.
+ if (isa<InlineAsm>(CS.getCalledValue())) {
+ DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ return false;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+ if (MSI->isVolatile()) {
+ DEBUG(dbgs() << "Can not optimize a volatile memset " <<
+ "intrinsic.\n");
+ return false;
+ }
+ Constant *Ptr = getVal(MSI->getDest());
+ Constant *Val = getVal(MSI->getValue());
+ Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+ if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+ // This memset is a no-op.
+ DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ ++CurInst;
+ continue;
+ }
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ // We don't insert an entry into Values, as it doesn't have a
+ // meaningful return value.
+ if (!II->use_empty()) {
+ DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n");
+ return false;
+ }
+ ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+ Value *PtrArg = getVal(II->getArgOperand(1));
+ Value *Ptr = PtrArg->stripPointerCasts();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ Type *ElemTy = GV->getValueType();
+ if (!Size->isMinusOne() &&
+ Size->getValue().getLimitedValue() >=
+ DL.getTypeStoreSize(ElemTy)) {
+ Invariants.insert(GV);
+ DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
+ << "\n");
+ } else {
+ DEBUG(dbgs() << "Found a global var, but can not treat it as an "
+ "invariant.\n");
+ }
+ }
+ // Continue even if we do nothing.
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::assume) {
+ DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ return false;
+ }
+
+ // Resolve function pointers.
+ Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+ if (!Callee || Callee->isInterposable()) {
+ DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ return false; // Cannot resolve.
+ }
+
+ SmallVector<Constant*, 8> Formals;
+ for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
+ Formals.push_back(getVal(*i));
+
+ if (Callee->isDeclaration()) {
+ // If this is a function we can constant fold, do it.
+ if (Constant *C = ConstantFoldCall(CS, Callee, Formals, TLI)) {
+ InstResult = C;
+ DEBUG(dbgs() << "Constant folded function call. Result: " <<
+ *InstResult << "\n");
+ } else {
+ DEBUG(dbgs() << "Can not constant fold function call.\n");
+ return false;
+ }
+ } else {
+ if (Callee->getFunctionType()->isVarArg()) {
+ DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ return false;
+ }
+
+ Constant *RetVal = nullptr;
+ // Execute the call, if successful, use the return value.
+ ValueStack.emplace_back();
+ if (!EvaluateFunction(Callee, RetVal, Formals)) {
+ DEBUG(dbgs() << "Failed to evaluate function.\n");
+ return false;
+ }
+ ValueStack.pop_back();
+ InstResult = RetVal;
+
+ if (InstResult) {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
+ } else {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+ }
+ }
+ } else if (isa<TerminatorInst>(CurInst)) {
+ DEBUG(dbgs() << "Found a terminator instruction.\n");
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+ if (BI->isUnconditional()) {
+ NextBB = BI->getSuccessor(0);
+ } else {
+ ConstantInt *Cond =
+ dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+ if (!Cond) return false; // Cannot determine.
+
+ NextBB = BI->getSuccessor(!Cond->getZExtValue());
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+ ConstantInt *Val =
+ dyn_cast<ConstantInt>(getVal(SI->getCondition()));
+ if (!Val) return false; // Cannot determine.
+ NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+ Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+ NextBB = BA->getBasicBlock();
+ else
+ return false; // Cannot determine.
+ } else if (isa<ReturnInst>(CurInst)) {
+ NextBB = nullptr;
+ } else {
+ // invoke, unwind, resume, unreachable.
+ DEBUG(dbgs() << "Can not handle terminator.");
+ return false; // Cannot handle this terminator.
+ }
+
+ // We succeeded at evaluating this block!
+ DEBUG(dbgs() << "Successfully evaluated block.\n");
+ return true;
+ } else {
+ // Did not know how to evaluate this!
+ DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
+ return false;
+ }
+
+ if (!CurInst->use_empty()) {
+ if (auto *FoldedInstResult = ConstantFoldConstant(InstResult, DL, TLI))
+ InstResult = FoldedInstResult;
+
+ setVal(&*CurInst, InstResult);
+ }
+
+ // If we just processed an invoke, we finished evaluating the block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+ NextBB = II->getNormalDest();
+ DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ return true;
+ }
+
+ // Advance program counter.
+ ++CurInst;
+ }
+}
+
+/// Evaluate a call to function F, returning true if successful, false if we
+/// can't evaluate it. ActualArgs contains the formal arguments for the
+/// function.
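+///
+/// A minimal illustrative driver (assumes DL and TLI are in scope and that F
+/// takes no parameters, as a static constructor would):
+///
+///   Evaluator Eval(DL, TLI);
+///   SmallVector<Constant *, 0> Args;
+///   Constant *RetVal = nullptr;
+///   if (Eval.EvaluateFunction(F, RetVal, Args))
+///     (void)Eval.getMutatedMemory(); // stores to commit back to globals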
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
+ const SmallVectorImpl<Constant*> &ActualArgs) {
+ // Check to see if this function is already executing (recursion). If so,
+ // bail out. TODO: we might want to accept limited recursion.
+ if (is_contained(CallStack, F))
+ return false;
+
+ CallStack.push_back(F);
+
+ // Initialize arguments to the incoming values specified.
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++ArgNo)
+ setVal(&*AI, ActualArgs[ArgNo]);
+
+ // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+ // we can only evaluate any one basic block at most once. This set keeps
+ // track of what we have executed so we can detect recursive cases etc.
+ SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+ // CurBB - The current basic block we're evaluating.
+ BasicBlock *CurBB = &F->front();
+
+ BasicBlock::iterator CurInst = CurBB->begin();
+
+ while (1) {
+ BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
+ DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
+ if (!EvaluateBlock(CurInst, NextBB))
+ return false;
+
+ if (!NextBB) {
+ // Successfully running until there's no next block means that we found
+      // the return. Fill in the return value and pop the call stack.
+ ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+ if (RI->getNumOperands())
+ RetVal = getVal(RI->getOperand(0));
+ CallStack.pop_back();
+ return true;
+ }
+
+ // Okay, we succeeded in evaluating this control flow. See if we have
+ // executed the new block before. If so, we have a looping function,
+ // which we cannot evaluate in reasonable time.
+ if (!ExecutedBlocks.insert(NextBB).second)
+ return false; // looped!
+
+ // Okay, we have never been in this block before. Check to see if there
+ // are any PHI nodes. If so, evaluate them with information about where
+ // we came from.
+ PHINode *PN = nullptr;
+ for (CurInst = NextBB->begin();
+ (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+ setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+
+ // Advance to the next block.
+ CurBB = NextBB;
+ }
+}
+
diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
new file mode 100644
index 000000000000..435eff3bef47
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -0,0 +1,482 @@
+//===- FlattenCFG.cpp - Code to perform CFG flattening ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Reduce conditional branches in CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+class FlattenCFGOpt {
+ AliasAnalysis *AA;
+ /// \brief Use parallel-and or parallel-or to generate conditions for
+ /// conditional branches.
+ bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
+ /// \brief If \param BB is the merge block of an if-region, attempt to merge
+ /// the if-region with an adjacent if-region upstream if two if-regions
+ /// contain identical instructions.
+ bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
+ /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which
+ /// are from two if-regions whose entry blocks are \p Head1 and \p
+ /// Head2. \returns true if \p Block1 and \p Block2 contain identical
+ /// instructions, and have no memory reference alias with \p Head2.
+ /// This is used as a legality check for merging if-regions.
+ bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
+ BasicBlock *Block1, BasicBlock *Block2);
+
+public:
+ FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
+ bool run(BasicBlock *BB);
+};
+}
+
+/// If \param [in] BB has more than one predecessor that is a conditional
+/// branch, attempt to use parallel and/or for the branch condition. \returns
+/// true on success.
+///
+/// Before:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// br i1 %cmp10, label %if.then, label %lor.rhs
+///
+/// lor.rhs:
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// br i1 %cmp11, label %if.then, label %if.end
+///
+/// if.end: // the merge block
+/// ......
+///
+/// if.then: // has two predecessors, both of which contain a conditional branch.
+/// ......
+/// br label %if.end;
+///
+/// After:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode.
+/// br i1 %cmp12, label %if.then, label %if.end
+///
+/// if.end:
+/// ......
+///
+/// if.then:
+/// ......
+/// br label %if.end;
+///
+/// Current implementation handles two cases.
+/// Case 1: \param BB is on the else-path.
+///
+/// BB1
+/// / |
+/// BB2 |
+/// / \ |
+/// BB3 \ | where, BB1, BB2 contain conditional branches.
+/// \ | / BB3 contains unconditional branch.
+/// \ | / BB4 corresponds to \param BB which is also the merge.
+/// BB => BB4
+///
+///
+/// Corresponding source code:
+///
+/// if (a == b && c == d)
+/// statement; // BB3
+///
+/// Case 2: \param BB BB is on the then-path.
+///
+/// BB1
+/// / |
+/// | BB2
+/// \ / | where BB1, BB2 contain conditional branches.
+/// BB => BB3 | BB3 contains unconditiona branch and corresponds
+/// \ / to \param BB. BB4 is the merge.
+/// BB4
+///
+/// Corresponding source code:
+///
+/// if (a == b || c == d)
+/// statement; // BB3
+///
+/// In both cases, \param BB is the common successor of conditional branches.
+/// In Case 1, \param BB (BB4) has an unconditional branch (BB3) as
+/// its predecessor. In Case 2, \param BB (BB3) only has conditional branches
+/// as its predecessors.
+///
+bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
+ PHINode *PHI = dyn_cast<PHINode>(BB->begin());
+ if (PHI)
+ return false; // For simplicity, avoid cases containing PHI nodes.
+
+ BasicBlock *LastCondBlock = nullptr;
+ BasicBlock *FirstCondBlock = nullptr;
+ BasicBlock *UnCondBlock = nullptr;
+ int Idx = -1;
+
+ // Check predecessors of \param BB.
+ SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+ for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
+ PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
+
+ // All predecessors should terminate with a branch.
+ if (!PBI)
+ return false;
+
+ BasicBlock *PP = Pred->getSinglePredecessor();
+
+ if (PBI->isUnconditional()) {
+ // Case 1: Pred (BB3) is an unconditional block; it should
+ // have a single predecessor (BB2) that is also a predecessor
+ // of \param BB (BB4), and its address should not be taken.
+ // There should be only one such unconditional
+ // branch among the predecessors.
+ if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
+ Pred->hasAddressTaken())
+ return false;
+
+ UnCondBlock = Pred;
+ continue;
+ }
+
+ // Only conditional branches are allowed beyond this point.
+ assert(PBI->isConditional());
+
+ // The condition's only use should be the branch instruction itself.
+ Value *PC = PBI->getCondition();
+ if (!PC || !PC->hasOneUse())
+ return false;
+
+ if (PP && Preds.count(PP)) {
+ // These are internal condition blocks to be merged from, e.g.,
+ // BB2 in both cases.
+ // Their addresses should not be taken.
+ if (Pred->hasAddressTaken())
+ return false;
+
+ // Instructions in the internal condition blocks should be safe
+ // to hoist up.
+ for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator();
+ BI != BE;) {
+ Instruction *CI = &*BI++;
+ if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+ } else {
+ // This is the condition block to be merged into, e.g. BB1 in
+ // both cases.
+ if (FirstCondBlock)
+ return false;
+ FirstCondBlock = Pred;
+ }
+
+ // Find whether BB is uniformly on the true (or false) path
+ // for all of its predecessors.
+ BasicBlock *PS1 = PBI->getSuccessor(0);
+ BasicBlock *PS2 = PBI->getSuccessor(1);
+ BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
+ int CIdx = (PS1 == BB) ? 0 : 1;
+
+ if (Idx == -1)
+ Idx = CIdx;
+ else if (CIdx != Idx)
+ return false;
+
+ // PS is the successor which is not BB. Check successors to identify
+ // the last conditional branch.
+ if (Preds.count(PS) == 0) {
+ // Case 2.
+ LastCondBlock = Pred;
+ } else {
+ // Case 1
+ BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
+ if (BPS && BPS->isUnconditional()) {
+ // Case 1: PS(BB3) should be an unconditional branch.
+ LastCondBlock = Pred;
+ }
+ }
+ }
+
+ if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
+ return false;
+
+ TerminatorInst *TBB = LastCondBlock->getTerminator();
+ BasicBlock *PS1 = TBB->getSuccessor(0);
+ BasicBlock *PS2 = TBB->getSuccessor(1);
+ BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
+ BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
+
+ // If PS1 does not jump into PS2, but PS2 jumps into PS1,
+ // attempt branch inversion.
+ if (!PBI1 || !PBI1->isUnconditional() ||
+ (PS1->getTerminator()->getSuccessor(0) != PS2)) {
+ // Check whether PS2 jumps into PS1.
+ if (!PBI2 || !PBI2->isUnconditional() ||
+ (PS2->getTerminator()->getSuccessor(0) != PS1))
+ return false;
+
+ // Do branch inversion.
+ BasicBlock *CurrBlock = LastCondBlock;
+ bool EverChanged = false;
+ for (;CurrBlock != FirstCondBlock;
+ CurrBlock = CurrBlock->getSinglePredecessor()) {
+ BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator());
+ CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI)
+ continue;
+
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
+ if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
+ CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
+ BI->swapSuccessors();
+ EverChanged = true;
+ }
+ }
+ return EverChanged;
+ }
+
+ // PS1 must end with an unconditional branch.
+ if (!PBI1 || !PBI1->isUnconditional())
+ return false;
+
+ // PS2 should not contain a PHI node.
+ PHI = dyn_cast<PHINode>(PS2->begin());
+ if (PHI)
+ return false;
+
+ // Do the transformation.
+ BasicBlock *CB;
+ BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator());
+ bool Iteration = true;
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Value *PC = PBI->getCondition();
+
+ do {
+ CB = PBI->getSuccessor(1 - Idx);
+ // Delete the conditional branch.
+ FirstCondBlock->getInstList().pop_back();
+ FirstCondBlock->getInstList()
+ .splice(FirstCondBlock->end(), CB->getInstList());
+ PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ Value *CC = PBI->getCondition();
+ // Merge conditions.
+ Builder.SetInsertPoint(PBI);
+ Value *NC;
+ if (Idx == 0)
+ // Case 2, use parallel or.
+ NC = Builder.CreateOr(PC, CC);
+ else
+ // Case 1, use parallel and.
+ NC = Builder.CreateAnd(PC, CC);
+
+ PBI->replaceUsesOfWith(CC, NC);
+ PC = NC;
+ if (CB == LastCondBlock)
+ Iteration = false;
+ // Remove internal conditional branches.
+ CB->dropAllReferences();
+ // Make CB unreachable and let a later pass delete the block.
+ new UnreachableInst(CB->getContext(), CB);
+ } while (Iteration);
+
+ DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+ return true;
+}
+
+/// Compare blocks from two if-regions, where \param Head1 is the entry of the
+/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param
+/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block
+/// in the 2nd if-region to compare. \returns true if \param Block1 and \param
+/// Block2 have identical instructions and do not have memory reference alias
+/// with \param Head2.
+///
+bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
+ BasicBlock *Block1,
+ BasicBlock *Block2) {
+ TerminatorInst *PTI2 = Head2->getTerminator();
+ Instruction *PBI2 = &Head2->front();
+
+ bool eq1 = (Block1 == Head1);
+ bool eq2 = (Block2 == Head2);
+ if (eq1 || eq2) {
+ // An empty then-path or else-path.
+ return (eq1 == eq2);
+ }
+
+ // Check whether instructions in Block1 and Block2 are identical
+ // and do not alias with instructions in Head2.
+ BasicBlock::iterator iter1 = Block1->begin();
+ BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();
+ BasicBlock::iterator iter2 = Block2->begin();
+ BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();
+
+ while (1) {
+ if (iter1 == end1) {
+ if (iter2 != end2)
+ return false;
+ break;
+ }
+
+ if (!iter1->isIdenticalTo(&*iter2))
+ return false;
+
+ // Illegal to remove instructions with side effects except
+ // non-volatile stores.
+ if (iter1->mayHaveSideEffects()) {
+ Instruction *CurI = &*iter1;
+ StoreInst *SI = dyn_cast<StoreInst>(CurI);
+ if (!SI || SI->isVolatile())
+ return false;
+ }
+
+ // For simplicity and speed, the data dependency check can be
+ // skipped when the instruction does not read from memory.
+ if (iter1->mayReadFromMemory())
+ return false;
+
+ if (iter1->mayWriteToMemory()) {
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
+ // Check alias with Head2.
+ if (!AA || AA->alias(&*iter1, &*BI))
+ return false;
+ }
+ }
+ }
+ ++iter1;
+ ++iter2;
+ }
+
+ return true;
+}
+
+/// Check whether \param BB is the merge block of an if-region. If so, check
+/// whether there exists an adjacent if-region upstream, the two if-regions
+/// contain identical instructions and can be legally merged. \returns true if
+/// the two if-regions are merged.
+///
+/// From:
+/// if (a)
+/// statement;
+/// if (b)
+/// statement;
+///
+/// To:
+/// if (a || b)
+/// statement;
+///
+bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
+ BasicBlock *IfTrue2, *IfFalse2;
+ Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+ Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
+ if (!CInst2)
+ return false;
+
+ BasicBlock *SecondEntryBlock = CInst2->getParent();
+ if (SecondEntryBlock->hasAddressTaken())
+ return false;
+
+ BasicBlock *IfTrue1, *IfFalse1;
+ Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+ Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
+ if (!CInst1)
+ return false;
+
+ BasicBlock *FirstEntryBlock = CInst1->getParent();
+
+ // Either then-path or else-path should be empty.
+ if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock))
+ return false;
+ if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
+ return false;
+
+ TerminatorInst *PTI2 = SecondEntryBlock->getTerminator();
+ Instruction *PBI2 = &SecondEntryBlock->front();
+
+ if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
+ IfTrue2))
+ return false;
+
+ if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1,
+ IfFalse2))
+ return false;
+
+ // Check that \param SecondEntryBlock has no side effects and is safe to
+ // speculatively execute.
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ Instruction *CI = &*BI;
+ if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
+ !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+
+ // Merge \param SecondEntryBlock into \param FirstEntryBlock.
+ FirstEntryBlock->getInstList().pop_back();
+ FirstEntryBlock->getInstList()
+ .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
+ BranchInst *PBI = dyn_cast<BranchInst>(FirstEntryBlock->getTerminator());
+ Value *CC = PBI->getCondition();
+ BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+ BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ Builder.SetInsertPoint(PBI);
+ Value *NC = Builder.CreateOr(CInst1, CC);
+ PBI->replaceUsesOfWith(CC, NC);
+ Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
+
+ // Remove IfTrue1
+ if (IfTrue1 != FirstEntryBlock) {
+ IfTrue1->dropAllReferences();
+ IfTrue1->eraseFromParent();
+ }
+
+ // Remove IfFalse1
+ if (IfFalse1 != FirstEntryBlock) {
+ IfFalse1->dropAllReferences();
+ IfFalse1->eraseFromParent();
+ }
+
+ // Remove \param SecondEntryBlock
+ SecondEntryBlock->dropAllReferences();
+ SecondEntryBlock->eraseFromParent();
+ DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+ return true;
+}
+
+bool FlattenCFGOpt::run(BasicBlock *BB) {
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ IRBuilder<> Builder(BB);
+
+ if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
+ return true;
+ return false;
+}
+
+/// FlattenCFG - This function is used to flatten a CFG. For example, it uses
+/// parallel-and and parallel-or mode to collapse if-conditions and to merge
+/// if-regions with identical statements.
+///
+bool llvm::FlattenCFG(BasicBlock *BB, AliasAnalysis *AA) {
+ return FlattenCFGOpt(AA).run(BB);
+}
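For context, llvm::FlattenCFG only processes one block per call, and each successful merge can expose further parallel-and/or opportunities, so a caller has to re-apply it until a fixed point is reached. The sketch below illustrates such a driver; it is not part of the file above, it assumes the llvm::FlattenCFG declaration from llvm/Transforms/Utils/Local.h, and iterativelyFlattenCFG is a hypothetical helper name.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h" // declares llvm::FlattenCFG

// Re-run FlattenCFG over every block until nothing changes. The pointer is
// taken and the iterator advanced before the call, because FlattenCFG may
// splice the visited block's instructions away and leave it unreachable.
static bool iterativelyFlattenCFG(llvm::Function &F, llvm::AliasAnalysis *AA) {
  bool Changed = false;
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    for (llvm::Function::iterator BBIt = F.begin(); BBIt != F.end();)
      if (llvm::FlattenCFG(&*BBIt++, AA))
        LocalChange = true;
    Changed |= LocalChange;
  }
  return Changed;
}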
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
new file mode 100644
index 000000000000..4a2be3a53176
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -0,0 +1,923 @@
+//===- FunctionComparator.cpp - Function Comparator -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionComparator and GlobalNumberState classes
+// which are used by the MergeFunctions pass for comparing functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "functioncomparator"
+
+int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
+ if (L < R) return -1;
+ if (L > R) return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
+ if ((int)L < (int)R) return -1;
+ if ((int)L > (int)R) return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
+ if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
+ return Res;
+ if (L.ugt(R)) return 1;
+ if (R.ugt(L)) return -1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
+ // Floats are ordered first by semantics (i.e. float, double, half, etc.),
+ // then by value interpreted as a bitstring (aka APInt).
+ const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
+ if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
+ APFloat::semanticsPrecision(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
+ APFloat::semanticsMaxExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
+ APFloat::semanticsMinExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
+ APFloat::semanticsSizeInBits(SR)))
+ return Res;
+ return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
+}
+
+int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
+ // Prevent heavy comparison, compare sizes first.
+ if (int Res = cmpNumbers(L.size(), R.size()))
+ return Res;
+
+ // Compare strings lexicographically only when it is necessary: only when
+ // strings are equal in size.
+ return L.compare(R);
+}
+
+int FunctionComparator::cmpAttrs(const AttributeList L,
+ const AttributeList R) const {
+ if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
+ return Res;
+
+ for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
+ AttributeSet LAS = L.getAttributes(i);
+ AttributeSet RAS = R.getAttributes(i);
+ AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
+ AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
+ for (; LI != LE && RI != RE; ++LI, ++RI) {
+ Attribute LA = *LI;
+ Attribute RA = *RI;
+ if (LA < RA)
+ return -1;
+ if (RA < LA)
+ return 1;
+ }
+ if (LI != LE)
+ return 1;
+ if (RI != RE)
+ return -1;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpRangeMetadata(const MDNode *L,
+ const MDNode *R) const {
+ if (L == R)
+ return 0;
+ if (!L)
+ return -1;
+ if (!R)
+ return 1;
+ // Range metadata is a sequence of numbers. Make sure they are the same
+ // sequence.
+ // TODO: Note that as this is metadata, it is possible to drop and/or merge
+ // this data when considering functions to merge. Thus this comparison would
+ // return 0 (i.e. equivalent), but merging would become more complicated
+ // because the ranges would need to be unioned. It is not likely that
+ // functions differ ONLY in this metadata if they are actually the same
+ // function semantically.
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+ for (size_t I = 0; I < L->getNumOperands(); ++I) {
+ ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
+ ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
+ if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
+ return Res;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L,
+ const Instruction *R) const {
+ ImmutableCallSite LCS(L);
+ ImmutableCallSite RCS(R);
+
+ assert(LCS && RCS && "Must be calls or invokes!");
+ assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!");
+
+ if (int Res =
+ cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
+ return Res;
+
+ for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) {
+ auto OBL = LCS.getOperandBundleAt(i);
+ auto OBR = RCS.getOperandBundleAt(i);
+
+ if (int Res = OBL.getTagName().compare(OBR.getTagName()))
+ return Res;
+
+ if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
+ return Res;
+ }
+
+ return 0;
+}
+
+/// Constants comparison:
+/// 1. Check whether type of L constant could be losslessly bitcasted to R
+/// type.
+/// 2. Compare constant contents.
+/// For more details see declaration comments.
+int FunctionComparator::cmpConstants(const Constant *L,
+ const Constant *R) const {
+
+ Type *TyL = L->getType();
+ Type *TyR = R->getType();
+
+ // Check whether the types are bitcastable. This part is essentially a
+ // re-factored Type::canLosslesslyBitCastTo, but instead of returning
+ // true/false it also encodes which type is "less" in our ordering.
+ int TypesRes = cmpTypes(TyL, TyR);
+ if (TypesRes != 0) {
+ // Types are different, but check whether we can bitcast them.
+ if (!TyL->isFirstClassType()) {
+ if (TyR->isFirstClassType())
+ return -1;
+ // Neither TyL nor TyR are values of first class type. Return the result
+ // of comparing the types
+ return TypesRes;
+ }
+ if (!TyR->isFirstClassType()) {
+ if (TyL->isFirstClassType())
+ return 1;
+ return TypesRes;
+ }
+
+ // Vector -> Vector conversions are always lossless if the two vector types
+ // have the same size, otherwise not.
+ unsigned TyLWidth = 0;
+ unsigned TyRWidth = 0;
+
+ if (auto *VecTyL = dyn_cast<VectorType>(TyL))
+ TyLWidth = VecTyL->getBitWidth();
+ if (auto *VecTyR = dyn_cast<VectorType>(TyR))
+ TyRWidth = VecTyR->getBitWidth();
+
+ if (TyLWidth != TyRWidth)
+ return cmpNumbers(TyLWidth, TyRWidth);
+
+ // Zero bit-width means neither TyL nor TyR are vectors.
+ if (!TyLWidth) {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+ if (PTyL && PTyR) {
+ unsigned AddrSpaceL = PTyL->getAddressSpace();
+ unsigned AddrSpaceR = PTyR->getAddressSpace();
+ if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
+ return Res;
+ }
+ if (PTyL)
+ return 1;
+ if (PTyR)
+ return -1;
+
+ // TyL and TyR aren't vectors, nor pointers. We don't know how to
+ // bitcast them.
+ return TypesRes;
+ }
+ }
+
+ // OK, types are bitcastable, now check constant contents.
+
+ if (L->isNullValue() && R->isNullValue())
+ return TypesRes;
+ if (L->isNullValue() && !R->isNullValue())
+ return 1;
+ if (!L->isNullValue() && R->isNullValue())
+ return -1;
+
+ auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L));
+ auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R));
+ if (GlobalValueL && GlobalValueR) {
+ return cmpGlobalValues(GlobalValueL, GlobalValueR);
+ }
+
+ if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
+ return Res;
+
+ if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
+ const auto *SeqR = cast<ConstantDataSequential>(R);
+ // This handles ConstantDataArray and ConstantDataVector. Note that we
+ // compare the two raw data arrays, which might differ depending on the host
+ // endianness. This isn't a problem though, because the endianness of a module
+ // will affect the order of the constants, but this order is the same
+ // for a given input module and host platform.
+ return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
+ }
+
+ switch (L->getValueID()) {
+ case Value::UndefValueVal:
+ case Value::ConstantTokenNoneVal:
+ return TypesRes;
+ case Value::ConstantIntVal: {
+ const APInt &LInt = cast<ConstantInt>(L)->getValue();
+ const APInt &RInt = cast<ConstantInt>(R)->getValue();
+ return cmpAPInts(LInt, RInt);
+ }
+ case Value::ConstantFPVal: {
+ const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
+ const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
+ return cmpAPFloats(LAPF, RAPF);
+ }
+ case Value::ConstantArrayVal: {
+ const ConstantArray *LA = cast<ConstantArray>(L);
+ const ConstantArray *RA = cast<ConstantArray>(R);
+ uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
+ uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
+ cast<Constant>(RA->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantStructVal: {
+ const ConstantStruct *LS = cast<ConstantStruct>(L);
+ const ConstantStruct *RS = cast<ConstantStruct>(R);
+ unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (unsigned i = 0; i != NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
+ cast<Constant>(RS->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantVectorVal: {
+ const ConstantVector *LV = cast<ConstantVector>(L);
+ const ConstantVector *RV = cast<ConstantVector>(R);
+ unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
+ cast<Constant>(RV->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantExprVal: {
+ const ConstantExpr *LE = cast<ConstantExpr>(L);
+ const ConstantExpr *RE = cast<ConstantExpr>(R);
+ unsigned NumOperandsL = LE->getNumOperands();
+ unsigned NumOperandsR = RE->getNumOperands();
+ if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
+ return Res;
+ for (unsigned i = 0; i < NumOperandsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
+ cast<Constant>(RE->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::BlockAddressVal: {
+ const BlockAddress *LBA = cast<BlockAddress>(L);
+ const BlockAddress *RBA = cast<BlockAddress>(R);
+ if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
+ return Res;
+ if (LBA->getFunction() == RBA->getFunction()) {
+ // They are BBs in the same function. Order by which comes first in the
+ // BB order of the function. This order is deterministic.
+ Function* F = LBA->getFunction();
+ BasicBlock *LBB = LBA->getBasicBlock();
+ BasicBlock *RBB = RBA->getBasicBlock();
+ if (LBB == RBB)
+ return 0;
+ for(BasicBlock &BB : F->getBasicBlockList()) {
+ if (&BB == LBB) {
+ assert(&BB != RBB);
+ return -1;
+ }
+ if (&BB == RBB)
+ return 1;
+ }
+ llvm_unreachable("Basic Block Address does not point to a basic block in "
+ "its function.");
+ return -1;
+ } else {
+ // cmpValues said the functions are the same. So because they aren't
+ // literally the same pointer, they must respectively be the left and
+ // right functions.
+ assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
+ // cmpValues will tell us if these are equivalent BasicBlocks, in the
+ // context of their respective functions.
+ return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
+ }
+ }
+ default: // Unknown constant, abort.
+ DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ llvm_unreachable("Constant ValueID not recognized.");
+ return -1;
+ }
+}
+
+int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
+ uint64_t LNumber = GlobalNumbers->getNumber(L);
+ uint64_t RNumber = GlobalNumbers->getNumber(R);
+ return cmpNumbers(LNumber, RNumber);
+}
+
+/// cmpTypes - Compare two types; defines a total ordering among the set of
+/// types.
+/// See method declaration comments for more details.
+int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ if (PTyL && PTyL->getAddressSpace() == 0)
+ TyL = DL.getIntPtrType(TyL);
+ if (PTyR && PTyR->getAddressSpace() == 0)
+ TyR = DL.getIntPtrType(TyR);
+
+ if (TyL == TyR)
+ return 0;
+
+ if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
+ return Res;
+
+ switch (TyL->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ // Fall through in Release mode.
+ LLVM_FALLTHROUGH;
+ case Type::IntegerTyID:
+ return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
+ cast<IntegerType>(TyR)->getBitWidth());
+ // TyL == TyR would have returned 0 earlier, because types are uniqued.
+ case Type::VoidTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::TokenTyID:
+ return 0;
+
+ case Type::PointerTyID: {
+ assert(PTyL && PTyR && "Both types must be pointers here.");
+ return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
+ }
+
+ case Type::StructTyID: {
+ StructType *STyL = cast<StructType>(TyL);
+ StructType *STyR = cast<StructType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+
+ if (STyL->isPacked() != STyR->isPacked())
+ return cmpNumbers(STyL->isPacked(), STyR->isPacked());
+
+ for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
+ if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::FunctionTyID: {
+ FunctionType *FTyL = cast<FunctionType>(TyL);
+ FunctionType *FTyR = cast<FunctionType>(TyR);
+ if (FTyL->getNumParams() != FTyR->getNumParams())
+ return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
+
+ if (FTyL->isVarArg() != FTyR->isVarArg())
+ return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
+
+ if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
+ return Res;
+
+ for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
+ if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::ArrayTyID:
+ case Type::VectorTyID: {
+ auto *STyL = cast<SequentialType>(TyL);
+ auto *STyR = cast<SequentialType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ }
+}
+
+// Determine whether the two operations are the same except that pointer-to-A
+// and pointer-to-B are equivalent. This should be kept in sync with
+// Instruction::isSameOperationAs.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpOperations(const Instruction *L,
+ const Instruction *R,
+ bool &needToCmpOperands) const {
+ needToCmpOperands = true;
+ if (int Res = cmpValues(L, R))
+ return Res;
+
+ // Differences from Instruction::isSameOperationAs:
+ // * replace type comparison with calls to cmpTypes.
+ // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
+ // * because of the above, we don't test for the tail bit on calls later on.
+ if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
+ return Res;
+
+ if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
+ needToCmpOperands = false;
+ const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
+ if (int Res =
+ cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
+ return Res;
+ return cmpGEPs(GEPL, GEPR);
+ }
+
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+
+ if (int Res = cmpTypes(L->getType(), R->getType()))
+ return Res;
+
+ if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
+ R->getRawSubclassOptionalData()))
+ return Res;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands have the same type.
+ for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
+ if (int Res =
+ cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
+ return Res;
+ }
+
+ // Check special state that is a part of some instructions.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
+ if (int Res = cmpTypes(AI->getAllocatedType(),
+ cast<AllocaInst>(R)->getAllocatedType()))
+ return Res;
+ return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
+ if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
+ return Res;
+ if (int Res = cmpNumbers(LI->getSyncScopeID(),
+ cast<LoadInst>(R)->getSyncScopeID()))
+ return Res;
+ return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
+ cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
+ if (int Res =
+ cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(SI->getSyncScopeID(),
+ cast<StoreInst>(R)->getSyncScopeID());
+ }
+ if (const CmpInst *CI = dyn_cast<CmpInst>(L))
+ return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
+ if (const CallInst *CI = dyn_cast<CallInst>(L)) {
+ if (int Res = cmpNumbers(CI->getCallingConv(),
+ cast<CallInst>(R)->getCallingConv()))
+ return Res;
+ if (int Res =
+ cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(CI, R))
+ return Res;
+ return cmpRangeMetadata(
+ CI->getMetadata(LLVMContext::MD_range),
+ cast<CallInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) {
+ if (int Res = cmpNumbers(II->getCallingConv(),
+ cast<InvokeInst>(R)->getCallingConv()))
+ return Res;
+ if (int Res =
+ cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(II, R))
+ return Res;
+ return cmpRangeMetadata(
+ II->getMetadata(LLVMContext::MD_range),
+ cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = IVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ return 0;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = EVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ }
+ if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
+ if (int Res =
+ cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(FI->getSyncScopeID(),
+ cast<FenceInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
+ if (int Res = cmpNumbers(CXI->isVolatile(),
+ cast<AtomicCmpXchgInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpNumbers(CXI->isWeak(),
+ cast<AtomicCmpXchgInst>(R)->isWeak()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getSuccessOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getFailureOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
+ return Res;
+ return cmpNumbers(CXI->getSyncScopeID(),
+ cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
+ if (int Res = cmpNumbers(RMWI->getOperation(),
+ cast<AtomicRMWInst>(R)->getOperation()))
+ return Res;
+ if (int Res = cmpNumbers(RMWI->isVolatile(),
+ cast<AtomicRMWInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpOrderings(RMWI->getOrdering(),
+ cast<AtomicRMWInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(RMWI->getSyncScopeID(),
+ cast<AtomicRMWInst>(R)->getSyncScopeID());
+ }
+ if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
+ const PHINode *PNR = cast<PHINode>(R);
+ // Ensure that in addition to the incoming values being identical
+ // (checked by the caller of this function), the incoming blocks
+ // are also identical.
+ for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
+ if (int Res =
+ cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
+ return Res;
+ }
+ }
+ return 0;
+}
+
+// Determine whether two GEP operations perform the same underlying arithmetic.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
+ const GEPOperator *GEPR) const {
+
+ unsigned int ASL = GEPL->getPointerAddressSpace();
+ unsigned int ASR = GEPR->getPointerAddressSpace();
+
+ if (int Res = cmpNumbers(ASL, ASR))
+ return Res;
+
+ // When we have target data, we can reduce the GEP down to the value in bytes
+ // added to the address.
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ unsigned BitWidth = DL.getPointerSizeInBits(ASL);
+ APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
+ if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
+ GEPR->accumulateConstantOffset(DL, OffsetR))
+ return cmpAPInts(OffsetL, OffsetR);
+ if (int Res = cmpTypes(GEPL->getSourceElementType(),
+ GEPR->getSourceElementType()))
+ return Res;
+
+ if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
+ return Res;
+
+ for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
+ if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
+ return Res;
+ }
+
+ return 0;
+}
+
+int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
+ const InlineAsm *R) const {
+ // InlineAsm's are uniqued. If they are the same pointer, obviously they are
+ // the same, otherwise compare the fields.
+ if (L == R)
+ return 0;
+ if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
+ return Res;
+ if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
+ return Res;
+ if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
+ return Res;
+ if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
+ return Res;
+ if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
+ return Res;
+ if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
+ return Res;
+ llvm_unreachable("InlineAsm blocks were not uniqued.");
+ return 0;
+}
+
+/// Compare two values used by the two functions under pair-wise comparison. If
+/// this is the first time the values are seen, they're added to the mapping so
+/// that we will detect mismatches on next use.
+/// See comments in declaration for more details.
+int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
+ // Catch self-reference case.
+ if (L == FnL) {
+ if (R == FnR)
+ return 0;
+ return -1;
+ }
+ if (R == FnR) {
+ if (L == FnL)
+ return 0;
+ return 1;
+ }
+
+ const Constant *ConstL = dyn_cast<Constant>(L);
+ const Constant *ConstR = dyn_cast<Constant>(R);
+ if (ConstL && ConstR) {
+ if (L == R)
+ return 0;
+ return cmpConstants(ConstL, ConstR);
+ }
+
+ if (ConstL)
+ return 1;
+ if (ConstR)
+ return -1;
+
+ const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
+ const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
+
+ if (InlineAsmL && InlineAsmR)
+ return cmpInlineAsm(InlineAsmL, InlineAsmR);
+ if (InlineAsmL)
+ return 1;
+ if (InlineAsmR)
+ return -1;
+
+ auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
+ RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
+
+ return cmpNumbers(LeftSN.first->second, RightSN.first->second);
+}
+
+// Test whether two basic blocks have equivalent behaviour.
+int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
+ const BasicBlock *BBR) const {
+ BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+ BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
+
+ do {
+ bool needToCmpOperands = true;
+ if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
+ return Res;
+ if (needToCmpOperands) {
+ assert(InstL->getNumOperands() == InstR->getNumOperands());
+
+ for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+ Value *OpL = InstL->getOperand(i);
+ Value *OpR = InstR->getOperand(i);
+ if (int Res = cmpValues(OpL, OpR))
+ return Res;
+ // cmpValues should ensure this is true.
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
+ }
+ }
+
+ ++InstL;
+ ++InstR;
+ } while (InstL != InstLE && InstR != InstRE);
+
+ if (InstL != InstLE && InstR == InstRE)
+ return 1;
+ if (InstL == InstLE && InstR != InstRE)
+ return -1;
+ return 0;
+}
+
+int FunctionComparator::compareSignature() const {
+ if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
+ return Res;
+
+ if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
+ return Res;
+
+ if (FnL->hasGC()) {
+ if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
+ return Res;
+
+ if (FnL->hasSection()) {
+ if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
+ return Res;
+
+ // TODO: if it's internal and only used in direct calls, we could handle this
+ // case too.
+ if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
+ return Res;
+
+ if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
+ return Res;
+
+ assert(FnL->arg_size() == FnR->arg_size() &&
+ "Identically typed functions have different numbers of args!");
+
+ // Visit the arguments so that they get enumerated in the order they're
+ // passed in.
+ for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
+ ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
+ if (cmpValues(&*ArgLI, &*ArgRI) != 0)
+ llvm_unreachable("Arguments repeat!");
+ }
+ return 0;
+}
+
+// Test whether the two functions have equivalent behaviour.
+int FunctionComparator::compare() {
+ beginCompare();
+
+ if (int Res = compareSignature())
+ return Res;
+
+ // We do a CFG-ordered walk since the actual ordering of the blocks in the
+ // linked list is immaterial. Our walk starts at the entry block for both
+ // functions, then takes each block from each terminator in order. As an
+ // artifact, this also means that unreachable blocks are ignored.
+ SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
+ SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
+
+ FnLBBs.push_back(&FnL->getEntryBlock());
+ FnRBBs.push_back(&FnR->getEntryBlock());
+
+ VisitedBBs.insert(FnLBBs[0]);
+ while (!FnLBBs.empty()) {
+ const BasicBlock *BBL = FnLBBs.pop_back_val();
+ const BasicBlock *BBR = FnRBBs.pop_back_val();
+
+ if (int Res = cmpValues(BBL, BBR))
+ return Res;
+
+ if (int Res = cmpBasicBlocks(BBL, BBR))
+ return Res;
+
+ const TerminatorInst *TermL = BBL->getTerminator();
+ const TerminatorInst *TermR = BBR->getTerminator();
+
+ assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
+ for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
+ continue;
+
+ FnLBBs.push_back(TermL->getSuccessor(i));
+ FnRBBs.push_back(TermR->getSuccessor(i));
+ }
+ }
+ return 0;
+}
+
+namespace {
+
+// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
+// hash of a sequence of 64-bit ints, but the entire input does not need to be
+// available at once. This interface is necessary for functionHash because it
+// needs to accumulate the hash as the structure of the function is traversed
+// without saving these values to an intermediate buffer. This form of hashing
+// is not often needed, as usually the object to hash is just read from a
+// buffer.
+class HashAccumulator64 {
+ uint64_t Hash;
+public:
+ // Initialize to random constant, so the state isn't zero.
+ HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
+ void add(uint64_t V) {
+ Hash = llvm::hashing::detail::hash_16_bytes(Hash, V);
+ }
+ // No finishing is required, because the entire hash value is used.
+ uint64_t getHash() { return Hash; }
+};
+} // end anonymous namespace
+
+// A function hash is calculated by considering only the number of arguments and
+// whether a function is varargs, the order of basic blocks (given by the
+// successors of each basic block in depth first order), and the order of
+// opcodes of each instruction within each of these basic blocks. This mirrors
+// the strategy compare() uses to compare functions by walking the BBs in depth
+// first order and comparing each instruction in sequence. Because this hash
+// does not look at the operands, it is insensitive to things such as the
+// target of calls and the constants used in the function, which makes it useful
+// when possibly merging functions which are the same modulo constants and call
+// targets.
+FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
+ HashAccumulator64 H;
+ H.add(F.isVarArg());
+ H.add(F.arg_size());
+
+ SmallVector<const BasicBlock *, 8> BBs;
+ SmallSet<const BasicBlock *, 16> VisitedBBs;
+
+ // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
+ // accumulating the hash of the function "structure." (BB and opcode sequence)
+ BBs.push_back(&F.getEntryBlock());
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
+ // This random value acts as a block header, as otherwise the partition of
+ // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
+ H.add(45798);
+ for (auto &Inst : *BB) {
+ H.add(Inst.getOpcode());
+ }
+ const TerminatorInst *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
+ continue;
+ BBs.push_back(Term->getSuccessor(i));
+ }
+ }
+ return H.getHash();
+}
+
+
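As a usage note, a client such as MergeFunctions is expected to bucket candidates by functionHash() first and only run the full ordering on hash collisions. The sketch below shows that call pattern under those assumptions; areFunctionsEquivalent is a hypothetical helper name, not part of this file.

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/FunctionComparator.h"

// Cheap hash filter first, then the full structural comparison. compare()
// defines a total order over functions; a result of 0 means "equal".
static bool areFunctionsEquivalent(llvm::Function &F1, llvm::Function &F2,
                                   llvm::GlobalNumberState &GN) {
  if (llvm::FunctionComparator::functionHash(F1) !=
      llvm::FunctionComparator::functionHash(F2))
    return false; // Equal functions hash equally, so a mismatch proves inequality.
  llvm::FunctionComparator FC(&F1, &F2, &GN);
  return FC.compare() == 0;
}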
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
new file mode 100644
index 000000000000..a98d07237b47
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -0,0 +1,262 @@
+//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionImportGlobalProcessing class, used
+// to perform the necessary global value handling for function importing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+using namespace llvm;
+
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) {
+
+ // For aliases, we tie the definition to the aliased base object, so extract it and recurse.
+ if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
+ if (GA->isInterposable())
+ return false;
+ const GlobalObject *GO = GA->getBaseObject();
+ if (!GO->hasLinkOnceODRLinkage())
+ return false;
+ return FunctionImportGlobalProcessing::doImportAsDefinition(
+ GO, GlobalsToImport);
+ }
+ // Only import the globals requested for importing.
+ if (GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
+ return true;
+ // Otherwise no.
+ return false;
+}
+
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV) {
+ if (!isPerformingImport())
+ return false;
+ return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
+ GlobalsToImport);
+}
+
+bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
+ const GlobalValue *SGV) {
+ assert(SGV->hasLocalLinkage());
+ // Both the imported references and the original local variable must
+ // be promoted.
+ if (!isPerformingImport() && !isModuleExporting())
+ return false;
+
+ if (isPerformingImport()) {
+ assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+ !isNonRenamableLocal(*SGV)) &&
+ "Attempting to promote non-renamable local");
+ // We don't know for sure yet if we are importing this value (as either
+ // a reference or a def), since we are simply walking all values in the
+ // module. But by necessity if we end up importing it and it is local,
+ // it must be promoted, so unconditionally promote all values in the
+ // importing module.
+ return true;
+ }
+
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID are the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ SGV->getGUID(), SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
+ if (!GlobalValue::isLocalLinkage(Linkage)) {
+ assert(!isNonRenamableLocal(*SGV) &&
+ "Attempting to promote non-renamable local");
+ return true;
+ }
+
+ return false;
+}
+
+#ifndef NDEBUG
+bool FunctionImportGlobalProcessing::isNonRenamableLocal(
+ const GlobalValue &GV) const {
+ if (!GV.hasLocalLinkage())
+ return false;
+ // This needs to stay in sync with the logic in buildModuleSummaryIndex.
+ if (GV.hasSection())
+ return true;
+ if (Used.count(const_cast<GlobalValue *>(&GV)))
+ return true;
+ return false;
+}
+#endif
+
+std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV,
+ bool DoPromote) {
+ // For locals that must be promoted to global scope, ensure that
+ // the promoted name uniquely identifies the copy in the original module,
+ // using the ID assigned during combined index creation. When importing,
+ // we rename all locals (not just those that are promoted) in order to
+ // avoid naming conflicts between locals imported from different modules.
+ if (SGV->hasLocalLinkage() && (DoPromote || isPerformingImport()))
+ return ModuleSummaryIndex::getGlobalNameForLocal(
+ SGV->getName(),
+ ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
+ return SGV->getName();
+}
+
+GlobalValue::LinkageTypes
+FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
+ bool DoPromote) {
+ // Any local variable that is referenced by an exported function needs
+ // to be promoted to global scope. Since we don't currently know which
+ // functions reference which local variables/functions, we must treat
+ // all as potentially exported if this module is exporting anything.
+ if (isModuleExporting()) {
+ if (SGV->hasLocalLinkage() && DoPromote)
+ return GlobalValue::ExternalLinkage;
+ return SGV->getLinkage();
+ }
+
+ // Otherwise, if we aren't importing, no linkage change is needed.
+ if (!isPerformingImport())
+ return SGV->getLinkage();
+
+ switch (SGV->getLinkage()) {
+ case GlobalValue::ExternalLinkage:
+ // External definitions are converted to available_externally
+ // definitions upon import, so that they are available for inlining
+ // and/or optimization, but are turned into declarations later
+ // during the EliminateAvailableExternally pass.
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ // An imported external declaration stays external.
+ return SGV->getLinkage();
+
+ case GlobalValue::AvailableExternallyLinkage:
+ // An imported available_externally definition converts
+ // to external if imported as a declaration.
+ if (!doImportAsDefinition(SGV))
+ return GlobalValue::ExternalLinkage;
+ // An imported available_externally declaration stays that way.
+ return SGV->getLinkage();
+
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ // These both stay the same when importing the definition.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakAnyLinkage:
+ // Can't import weak_any definitions correctly, or we might change the
+ // program semantics, since the linker will pick the first weak_any
+ // definition and importing would change the order they are seen by the
+ // linker. The module linking caller needs to enforce this.
+ assert(!doImportAsDefinition(SGV));
+ // If imported as a declaration, it becomes external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakODRLinkage:
+ // For weak_odr linkage, there is a guarantee that all copies will be
+ // equivalent, so the issue described above for weak_any does not exist,
+ // and the definition can be imported. It can be treated similarly
+ // to an imported externally visible global value.
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+
+ case GlobalValue::AppendingLinkage:
+ // It would be incorrect to import an appending linkage variable,
+ // since it would cause global constructors/destructors to be
+ // executed multiple times. This should have already been handled
+ // by linkIfNeeded, and we will assert in shouldLinkFromSource
+ // if we try to import, so we simply return AppendingLinkage.
+ return GlobalValue::AppendingLinkage;
+
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ // If we are promoting the local to global scope, it is handled
+ // similarly to a normal externally visible global.
+ if (DoPromote) {
+ if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+ }
+ // A non-promoted imported local definition stays local.
+ // The ThinLTO pass will eventually force-import its definition.
+ return SGV->getLinkage();
+
+ case GlobalValue::ExternalWeakLinkage:
+ // External weak doesn't apply to definitions, must be a declaration.
+ assert(!doImportAsDefinition(SGV));
+ // Linkage stays external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::CommonLinkage:
+ // Linkage stays common on definitions.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+ }
+
+ llvm_unreachable("unknown linkage type");
+}
+
+void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+ bool DoPromote = false;
+ if (GV.hasLocalLinkage() &&
+ ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
+ // Once we change the name or linkage it is difficult to determine
+ // again whether we should promote since shouldPromoteLocalToGlobal needs
+ // to locate the summary (based on GUID from name and linkage). Therefore,
+ // use DoPromote result saved above.
+ GV.setName(getName(&GV, DoPromote));
+ GV.setLinkage(getLinkage(&GV, DoPromote));
+ if (!GV.hasLocalLinkage())
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ } else
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
+
+ // Remove functions imported as available_externally definitions from comdats,
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
+ // The IRMover should not have placed any imported declarations in
+ // a comdat, so the only declaration that should be in a comdat
+ // at this point would be a definition imported as available_externally.
+ assert(GO->hasAvailableExternallyLinkage() &&
+ "Expected comdat on definition (possibly available external)");
+ GO->setComdat(nullptr);
+ }
+}
+
+void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
+ for (GlobalVariable &GV : M.globals())
+ processGlobalForThinLTO(GV);
+ for (Function &SF : M)
+ processGlobalForThinLTO(SF);
+ for (GlobalAlias &GA : M.aliases())
+ processGlobalForThinLTO(GA);
+}
+
+bool FunctionImportGlobalProcessing::run() {
+ processGlobalsForThinLTO();
+ return false;
+}
+
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ SetVector<GlobalValue *> *GlobalsToImport) {
+ FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
+ return ThinLTOProcessing.run();
+}
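For orientation, renameModuleForThinLTO above is the entry point a ThinLTO backend calls on a module. In this version, passing a null GlobalsToImport set appears to select the exporting case (promote and rename locals in place), while a populated set is supplied when importing into the module. A hedged usage sketch, with a hypothetical helper name:

#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"

// Promote/rename the locals of M so other modules can reference them after
// the index-driven ThinLTO link. nullptr means M is being exported, not
// imported into.
static void promoteAndRenameForThinLTO(llvm::Module &M,
                                       const llvm::ModuleSummaryIndex &Index) {
  bool Changed =
      llvm::renameModuleForThinLTO(M, Index, /*GlobalsToImport=*/nullptr);
  (void)Changed; // run() above currently always returns false.
}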
diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 000000000000..245fefb38ee8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,196 @@
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+/// Return the stronger of the two orderings. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+ if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
+ (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
+ return AtomicOrdering::AcquireRelease;
+ return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
+}
+
+/// It is safe to destroy a constant iff it is itself used only by constants.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+ if (isa<GlobalValue>(C))
+ return false;
+
+ if (isa<ConstantData>(C))
+ return false;
+
+ for (const User *U : C->users())
+ if (const Constant *CU = dyn_cast<Constant>(U)) {
+ if (!isSafeToDestroyConstant(CU))
+ return false;
+ } else
+ return false;
+ return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+ SmallPtrSetImpl<const PHINode *> &PhiUsers) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->isExternallyInitialized())
+ GS.StoredType = GlobalStatus::StoredOnce;
+
+ for (const Use &U : V->uses()) {
+ const User *UR = U.getUser();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
+ GS.HasNonInstructionUser = true;
+
+ // If the result of the constantexpr isn't a pointer type, then we won't
+ // know to expect it in various places. Just reject early.
+ if (!isa<PointerType>(CE->getType()))
+ return true;
+
+ if (analyzeGlobalAux(CE, GS, PhiUsers))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
+ if (!GS.HasMultipleAccessingFunctions) {
+ const Function *F = I->getParent()->getParent();
+ if (!GS.AccessingFunction)
+ GS.AccessingFunction = F;
+ else if (GS.AccessingFunction != F)
+ GS.HasMultipleAccessingFunctions = true;
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ GS.IsLoaded = true;
+ // Don't hack on volatile loads.
+ if (LI->isVolatile())
+ return true;
+ GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Don't allow a store OF the address, only stores TO the address.
+ if (SI->getOperand(0) == V)
+ return true;
+
+ // Don't hack on volatile stores.
+ if (SI->isVolatile())
+ return true;
+
+ GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+ // If this is a direct store to the global (i.e., the global is a scalar
+ // value, not an aggregate), keep more specific information about
+ // stores.
+ if (GS.StoredType != GlobalStatus::Stored) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+ Value *StoredVal = SI->getOperand(0);
+
+ if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+ if (C->isThreadDependent()) {
+ // The stored value changes between threads; don't track it.
+ return true;
+ }
+ }
+
+ if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (isa<LoadInst>(StoredVal) &&
+ cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+ GS.StoredType = GlobalStatus::StoredOnce;
+ GS.StoredOnceValue = StoredVal;
+ } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+ GS.StoredOnceValue == StoredVal) {
+ // noop.
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ }
+ } else if (isa<BitCastInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<GetElementPtrInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<SelectInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+ // We can check PHI nodes just like select or GEP instructions, but we
+ // have to be careful about infinite recursion.
+ if (PhiUsers.insert(PN).second) // Not already visited.
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<CmpInst>(I)) {
+ GS.IsCompared = true;
+ } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+ if (MTI->isVolatile())
+ return true;
+ if (MTI->getArgOperand(0) == V)
+ GS.StoredType = GlobalStatus::Stored;
+ if (MTI->getArgOperand(1) == V)
+ GS.IsLoaded = true;
+ } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+ assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+ if (MSI->isVolatile())
+ return true;
+ GS.StoredType = GlobalStatus::Stored;
+ } else if (auto C = ImmutableCallSite(I)) {
+ if (!C.isCallee(&U))
+ return true;
+ GS.IsLoaded = true;
+ } else {
+ return true; // Any other non-load instruction might take the address!
+ }
+ } else if (const Constant *C = dyn_cast<Constant>(UR)) {
+ GS.HasNonInstructionUser = true;
+ // We might have a dead and dangling constant hanging off of here.
+ if (!isSafeToDestroyConstant(C))
+ return true;
+ } else {
+ GS.HasNonInstructionUser = true;
+ // Otherwise must be some other user.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+GlobalStatus::GlobalStatus() = default;
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+ SmallPtrSet<const PHINode *, 16> PhiUsers;
+ return analyzeGlobalAux(V, GS, PhiUsers);
+}
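
To show how this analysis is typically consumed, a minimal, hypothetical sketch follows; it assumes only the analyzeGlobal entry point defined above and the StoredType ordering declared in GlobalStatus.h.

#include "llvm/IR/GlobalVariable.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
using namespace llvm;

// Sketch: true when GV is written at most once beyond its initializer and
// the analysis did not have to give up -- the kind of precondition
// GlobalOpt-style transforms check before rewriting a global.
static bool isStoredAtMostOnce(const GlobalVariable &GV) {
  GlobalStatus GS;
  // analyzeGlobal returns true when it bails out (volatile access, escaping
  // address, an unanalyzable user, ...).
  if (GlobalStatus::analyzeGlobal(&GV, GS))
    return false;
  return GS.StoredType <= GlobalStatus::StoredOnce;
}
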
diff --git a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
new file mode 100644
index 000000000000..b8c12ad5ea84
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -0,0 +1,205 @@
+//===-- ImportedFunctionsInliningStatistics.cpp ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Generating inliner statistics for imported functions, mostly useful for
+// ThinLTO.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iomanip>
+#include <sstream>
+using namespace llvm;
+
+ImportedFunctionsInliningStatistics::InlineGraphNode &
+ImportedFunctionsInliningStatistics::createInlineGraphNode(const Function &F) {
+
+ auto &ValueLookup = NodesMap[F.getName()];
+ if (!ValueLookup) {
+ ValueLookup = llvm::make_unique<InlineGraphNode>();
+ ValueLookup->Imported = F.getMetadata("thinlto_src_module") != nullptr;
+ }
+ return *ValueLookup;
+}
+
+void ImportedFunctionsInliningStatistics::recordInline(const Function &Caller,
+ const Function &Callee) {
+
+ InlineGraphNode &CallerNode = createInlineGraphNode(Caller);
+ InlineGraphNode &CalleeNode = createInlineGraphNode(Callee);
+ CalleeNode.NumberOfInlines++;
+
+ if (!CallerNode.Imported && !CalleeNode.Imported) {
+ // A direct inline from a non-imported callee into a non-imported caller,
+ // so we don't have to add it to the graph. This is still useful when
+ // collecting inliner statistics in a compile step with no imported
+ // functions, where the graph would otherwise stay empty.
+ CalleeNode.NumberOfRealInlines++;
+ return;
+ }
+
+ CallerNode.InlinedCallees.push_back(&CalleeNode);
+ if (!CallerNode.Imported) {
+ // We could avoid the second lookup, but it would make the code much uglier.
+ auto It = NodesMap.find(Caller.getName());
+ assert(It != NodesMap.end() && "The node should be already there.");
+ // Save Caller as a starting node for traversal. The string has to be the
+ // one owned by the map, because Caller can disappear (and its name with it).
+ NonImportedCallers.push_back(It->first());
+ }
+}
+
+void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) {
+ ModuleName = M.getName();
+ for (const auto &F : M.functions()) {
+ if (F.isDeclaration())
+ continue;
+ AllFunctions++;
+ ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr);
+ }
+}
+static std::string getStatString(const char *Msg, int32_t Fraction, int32_t All,
+ const char *PercentageOfMsg,
+ bool LineEnd = true) {
+ double Result = 0;
+ if (All != 0)
+ Result = 100 * static_cast<double>(Fraction) / All;
+
+ std::stringstream Str;
+ Str << std::setprecision(4) << Msg << ": " << Fraction << " [" << Result
+ << "% of " << PercentageOfMsg << "]";
+ if (LineEnd)
+ Str << "\n";
+ return Str.str();
+}
+
+void ImportedFunctionsInliningStatistics::dump(const bool Verbose) {
+ calculateRealInlines();
+ NonImportedCallers.clear();
+
+ int32_t InlinedImportedFunctionsCount = 0;
+ int32_t InlinedNotImportedFunctionsCount = 0;
+
+ int32_t InlinedImportedFunctionsToImportingModuleCount = 0;
+ int32_t InlinedNotImportedFunctionsToImportingModuleCount = 0;
+
+ const auto SortedNodes = getSortedNodes();
+ std::string Out;
+ Out.reserve(5000);
+ raw_string_ostream Ostream(Out);
+
+ Ostream << "------- Dumping inliner stats for [" << ModuleName
+ << "] -------\n";
+
+ if (Verbose)
+ Ostream << "-- List of inlined functions:\n";
+
+ for (const auto &Node : SortedNodes) {
+ assert(Node->second->NumberOfInlines >= Node->second->NumberOfRealInlines);
+ if (Node->second->NumberOfInlines == 0)
+ continue;
+
+ if (Node->second->Imported) {
+ InlinedImportedFunctionsCount++;
+ InlinedImportedFunctionsToImportingModuleCount +=
+ int(Node->second->NumberOfRealInlines > 0);
+ } else {
+ InlinedNotImportedFunctionsCount++;
+ InlinedNotImportedFunctionsToImportingModuleCount +=
+ int(Node->second->NumberOfRealInlines > 0);
+ }
+
+ if (Verbose)
+ Ostream << "Inlined "
+ << (Node->second->Imported ? "imported " : "not imported ")
+ << "function [" << Node->first() << "]"
+ << ": #inlines = " << Node->second->NumberOfInlines
+ << ", #inlines_to_importing_module = "
+ << Node->second->NumberOfRealInlines << "\n";
+ }
+
+ auto InlinedFunctionsCount =
+ InlinedImportedFunctionsCount + InlinedNotImportedFunctionsCount;
+ auto NotImportedFuncCount = AllFunctions - ImportedFunctions;
+ auto ImportedNotInlinedIntoModule =
+ ImportedFunctions - InlinedImportedFunctionsToImportingModuleCount;
+
+ Ostream << "-- Summary:\n"
+ << "All functions: " << AllFunctions
+ << ", imported functions: " << ImportedFunctions << "\n"
+ << getStatString("inlined functions", InlinedFunctionsCount,
+ AllFunctions, "all functions")
+ << getStatString("imported functions inlined anywhere",
+ InlinedImportedFunctionsCount, ImportedFunctions,
+ "imported functions")
+ << getStatString("imported functions inlined into importing module",
+ InlinedImportedFunctionsToImportingModuleCount,
+ ImportedFunctions, "imported functions",
+ /*LineEnd=*/false)
+ << getStatString(", remaining", ImportedNotInlinedIntoModule,
+ ImportedFunctions, "imported functions")
+ << getStatString("non-imported functions inlined anywhere",
+ InlinedNotImportedFunctionsCount,
+ NotImportedFuncCount, "non-imported functions")
+ << getStatString(
+ "non-imported functions inlined into importing module",
+ InlinedNotImportedFunctionsToImportingModuleCount,
+ NotImportedFuncCount, "non-imported functions");
+ Ostream.flush();
+ dbgs() << Out;
+}
+
+void ImportedFunctionsInliningStatistics::calculateRealInlines() {
+ // Removing duplicated Callers.
+ std::sort(NonImportedCallers.begin(), NonImportedCallers.end());
+ NonImportedCallers.erase(
+ std::unique(NonImportedCallers.begin(), NonImportedCallers.end()),
+ NonImportedCallers.end());
+
+ for (const auto &Name : NonImportedCallers) {
+ auto &Node = *NodesMap[Name];
+ if (!Node.Visited)
+ dfs(Node);
+ }
+}
+
+void ImportedFunctionsInliningStatistics::dfs(InlineGraphNode &GraphNode) {
+ assert(!GraphNode.Visited);
+ GraphNode.Visited = true;
+ for (auto *const InlinedFunctionNode : GraphNode.InlinedCallees) {
+ InlinedFunctionNode->NumberOfRealInlines++;
+ if (!InlinedFunctionNode->Visited)
+ dfs(*InlinedFunctionNode);
+ }
+}
+
+ImportedFunctionsInliningStatistics::SortedNodesTy
+ImportedFunctionsInliningStatistics::getSortedNodes() {
+ SortedNodesTy SortedNodes;
+ SortedNodes.reserve(NodesMap.size());
+ for (const NodesMapTy::value_type &Node : NodesMap)
+ SortedNodes.push_back(&Node);
+
+ std::sort(
+ SortedNodes.begin(), SortedNodes.end(),
+ [&](const SortedNodesTy::value_type &Lhs,
+ const SortedNodesTy::value_type &Rhs) {
+ if (Lhs->second->NumberOfInlines != Rhs->second->NumberOfInlines)
+ return Lhs->second->NumberOfInlines > Rhs->second->NumberOfInlines;
+ if (Lhs->second->NumberOfRealInlines != Rhs->second->NumberOfRealInlines)
+ return Lhs->second->NumberOfRealInlines >
+ Rhs->second->NumberOfRealInlines;
+ return Lhs->first() < Rhs->first();
+ });
+ return SortedNodes;
+}
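
A sketch of the intended call pattern from an inliner pass; the driver function and its parameters below are hypothetical, while the three member functions are the ones defined above.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
#include <utility>
using namespace llvm;

// Hypothetical driver: report per-module inliner statistics given the
// (Caller, Callee) pairs that were successfully inlined.
static void reportImportedFunctionInlining(
    const Module &M, ArrayRef<std::pair<Function *, Function *>> Inlines) {
  ImportedFunctionsInliningStatistics Stats;
  Stats.setModuleInfo(M);                     // record module name and counts
  for (const auto &P : Inlines)
    Stats.recordInline(*P.first, *P.second);  // one call per successful inline
  Stats.dump(/*Verbose=*/true);               // prints the summary to dbgs()
}
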
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
new file mode 100644
index 000000000000..2a18c140c788
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -0,0 +1,2282 @@
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
+ cl::Hidden,
+ cl::desc("Convert noalias attributes to metadata during inlining."));
+
+static cl::opt<bool>
+PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
+ cl::init(true), cl::Hidden,
+ cl::desc("Convert align attributes to assumptions during inlining."));
+
+bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI,
+ AAResults *CalleeAAR, bool InsertLifetime) {
+ return InlineFunction(CallSite(CI), IFI, CalleeAAR, InsertLifetime);
+}
+bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
+ AAResults *CalleeAAR, bool InsertLifetime) {
+ return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime);
+}
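
For reference, a hedged sketch of how a pass might drive these entry points. The helper below is hypothetical and assumes that InlineFunctionInfo has already been set up by the owning pass (call graph, assumption caches, and so on, as available); it relies only on the CallSite overload forwarded to above.

// Hypothetical sketch: inline every direct call to Callee that appears in
// Caller. Call sites are collected first so that the IR mutation done by
// InlineFunction does not invalidate the iteration over Callee's users.
static bool inlineAllCallsTo(Function &Caller, Function &Callee,
                             InlineFunctionInfo &IFI) {
  SmallVector<CallSite, 8> Calls;
  for (User *U : Callee.users())
    if (auto CS = CallSite(U))
      if (CS.getCalledFunction() == &Callee &&
          CS.getInstruction()->getFunction() == &Caller)
        Calls.push_back(CS);

  bool Changed = false;
  for (CallSite CS : Calls)
    Changed |= InlineFunction(CS, IFI, /*CalleeAAR=*/nullptr,
                              /*InsertLifetime=*/true);
  return Changed;
}
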
+
+namespace {
+ /// A class for recording information about inlining a landing pad.
+ class LandingPadInliningInfo {
+ BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind.
+ BasicBlock *InnerResumeDest; ///< Destination for the callee's resume.
+ LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke.
+ PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts.
+ SmallVector<Value*, 8> UnwindDestPHIValues;
+
+ public:
+ LandingPadInliningInfo(InvokeInst *II)
+ : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr),
+ CallerLPad(nullptr), InnerEHValuesPHI(nullptr) {
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing
+ // the edge from this block.
+ llvm::BasicBlock *InvokeBB = II->getParent();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (; isa<PHINode>(I); ++I) {
+ // Save the value to use for this edge.
+ PHINode *PHI = cast<PHINode>(I);
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ CallerLPad = cast<LandingPadInst>(I);
+ }
+
+ /// The outer unwind destination is the target of
+ /// unwind edges introduced for calls within the inlined function.
+ BasicBlock *getOuterResumeDest() const {
+ return OuterResumeDest;
+ }
+
+ BasicBlock *getInnerResumeDest();
+
+ LandingPadInst *getLandingPadInst() const { return CallerLPad; }
+
+ /// Forward the 'resume' instruction to the caller's landing pad block.
+ /// When the landing pad block has only one predecessor, this is
+ /// a simple branch. When there is more than one predecessor, we need to
+ /// split the landing pad block after the landingpad instruction and jump
+ /// to there.
+ void forwardResume(ResumeInst *RI,
+ SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
+
+ /// Add incoming-PHI values to the unwind destination block for the given
+ /// basic block, using the values for the original invoke's source block.
+ void addIncomingPHIValuesFor(BasicBlock *BB) const {
+ addIncomingPHIValuesForInto(BB, OuterResumeDest);
+ }
+
+ void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+ BasicBlock::iterator I = dest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *phi = cast<PHINode>(I);
+ phi->addIncoming(UnwindDestPHIValues[i], src);
+ }
+ }
+ };
+} // anonymous namespace
+
+/// Get or create a target for the branch from ResumeInsts.
+BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {
+ if (InnerResumeDest) return InnerResumeDest;
+
+ // Split the landing pad.
+ BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();
+ InnerResumeDest =
+ OuterResumeDest->splitBasicBlock(SplitPoint,
+ OuterResumeDest->getName() + ".body");
+
+ // The number of incoming edges we expect to the inner landing pad.
+ const unsigned PHICapacity = 2;
+
+ // Create corresponding new PHIs for all the PHIs in the outer landing pad.
+ Instruction *InsertPoint = &InnerResumeDest->front();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *OuterPHI = cast<PHINode>(I);
+ PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
+ OuterPHI->getName() + ".lpad-body",
+ InsertPoint);
+ OuterPHI->replaceAllUsesWith(InnerPHI);
+ InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
+ }
+
+ // Create a PHI for the exception values.
+ InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
+ "eh.lpad-body", InsertPoint);
+ CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
+ InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
+
+ // All done.
+ return InnerResumeDest;
+}
+
+/// Forward the 'resume' instruction to the caller's landing pad block.
+/// When the landing pad block has only one predecessor, this is a simple
+/// branch. When there is more than one predecessor, we need to split the
+/// landing pad block after the landingpad instruction and jump to there.
+void LandingPadInliningInfo::forwardResume(
+ ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {
+ BasicBlock *Dest = getInnerResumeDest();
+ BasicBlock *Src = RI->getParent();
+
+ BranchInst::Create(Dest, Src);
+
+ // Update the PHIs in the destination. They were inserted in an order which
+ // makes this work.
+ addIncomingPHIValuesForInto(Src, Dest);
+
+ InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
+ RI->eraseFromParent();
+}
+
+/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
+static Value *getParentPad(Value *EHPad) {
+ if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
+ return FPI->getParentPad();
+ return cast<CatchSwitchInst>(EHPad)->getParentPad();
+}
+
+typedef DenseMap<Instruction *, Value *> UnwindDestMemoTy;
+
+/// Helper for getUnwindDestToken that does the descendant-ward part of
+/// the search.
+static Value *getUnwindDestTokenHelper(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ SmallVector<Instruction *, 8> Worklist(1, EHPad);
+
+ while (!Worklist.empty()) {
+ Instruction *CurrentPad = Worklist.pop_back_val();
+ // We only put pads on the worklist that aren't in the MemoMap. When
+ // we find an unwind dest for a pad we may update its ancestors, but
+ // the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
+ // so they should never get updated while queued on the worklist.
+ assert(!MemoMap.count(CurrentPad));
+ Value *UnwindDestToken = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
+ if (CatchSwitch->hasUnwindDest()) {
+ UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
+ } else {
+ // Catchswitch doesn't have a 'nounwind' variant, and one might be
+ // annotated as "unwinds to caller" when really it's nounwind (see
+ // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
+ // parent's unwind dest from this. We can check its catchpads'
+ // descendants, since they might include a cleanuppad with an
+ // "unwinds to caller" cleanupret, which can be trusted.
+ for (auto HI = CatchSwitch->handler_begin(),
+ HE = CatchSwitch->handler_end();
+ HI != HE && !UnwindDestToken; ++HI) {
+ BasicBlock *HandlerBlock = *HI;
+ auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
+ for (User *Child : CatchPad->users()) {
+ // Intentionally ignore invokes here -- since the catchswitch is
+ // marked "unwind to caller", it would be a verifier error if it
+ // contained an invoke which unwinds out of it, so any invoke we'd
+ // encounter must unwind to some child of the catch.
+ if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
+ continue;
+
+ Instruction *ChildPad = cast<Instruction>(Child);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't figured out this child pad yet; queue it.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've already checked this child, but might have found that
+ // it offers no proof either way.
+ Value *ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ // We already know the child's unwind dest, which can either
+ // be ConstantTokenNone to indicate unwind to caller, or can
+ // be another child of the catchpad. Only the former indicates
+ // the unwind dest of the catchswitch.
+ if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ assert(getParentPad(ChildUnwindDestToken) == CatchPad);
+ }
+ }
+ }
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
+ for (User *U : CleanupPad->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
+ UnwindDestToken = RetUnwindDest->getFirstNonPHI();
+ else
+ UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
+ break;
+ }
+ Value *ChildUnwindDestToken;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
+ } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
+ Instruction *ChildPad = cast<Instruction>(U);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't resolved this child yet; queue it and keep searching.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've checked this child, but still need to ignore it if it
+ // had no proof either way.
+ ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ } else {
+ // Not a relevant user of the cleanuppad
+ continue;
+ }
+ // In a well-formed program, the child/invoke must either unwind to
+ // an(other) child of the cleanup, or exit the cleanup. In the
+ // first case, continue searching.
+ if (isa<Instruction>(ChildUnwindDestToken) &&
+ getParentPad(ChildUnwindDestToken) == CleanupPad)
+ continue;
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ }
+ // If we haven't found an unwind dest for CurrentPad, we may have queued its
+ // children, so move on to the next in the worklist.
+ if (!UnwindDestToken)
+ continue;
+
+ // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
+ // any ancestors of CurrentPad up to but not including UnwindDestToken's
+ // parent pad. Record this in the memo map, and check to see if the
+ // original EHPad being queried is one of the ones exited.
+ Value *UnwindParent;
+ if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
+ UnwindParent = getParentPad(UnwindPad);
+ else
+ UnwindParent = nullptr;
+ bool ExitedOriginalPad = false;
+ for (Instruction *ExitedPad = CurrentPad;
+ ExitedPad && ExitedPad != UnwindParent;
+ ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(ExitedPad))
+ continue;
+ MemoMap[ExitedPad] = UnwindDestToken;
+ ExitedOriginalPad |= (ExitedPad == EHPad);
+ }
+
+ if (ExitedOriginalPad)
+ return UnwindDestToken;
+
+ // Continue the search.
+ }
+
+ // No definitive information is contained within this funclet.
+ return nullptr;
+}
+
+/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
+/// return that pad instruction. If it unwinds to caller, return
+/// ConstantTokenNone. If it does not have a definitive unwind destination,
+/// return nullptr.
+///
+/// This routine gets invoked for calls in funclets in inlinees when inlining
+/// an invoke. Since many funclets don't have calls inside them, it's queried
+/// on-demand rather than building a map of pads to unwind dests up front.
+/// Determining a funclet's unwind dest may require recursively searching its
+/// descendants, and also ancestors and cousins if the descendants don't provide
+/// an answer. Since most funclets will have their unwind dest immediately
+/// available as the unwind dest of a catchswitch or cleanupret, this routine
+/// searches top-down from the given pad and then up. To avoid worst-case
+/// quadratic run-time given that approach, it uses a memo map to avoid
+/// re-processing funclet trees. The callers that rewrite the IR as they go
+/// take advantage of this, for correctness, by checking/forcing rewritten
+/// pads' entries to match the original callee view.
+static Value *getUnwindDestToken(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ // Catchpads unwind to the same place as their catchswitch;
+ // redirect any queries on catchpads so the code below can
+ // deal with just catchswitches and cleanuppads.
+ if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
+ EHPad = CPI->getCatchSwitch();
+
+ // Check if we've already determined the unwind dest for this pad.
+ auto Memo = MemoMap.find(EHPad);
+ if (Memo != MemoMap.end())
+ return Memo->second;
+
+ // Search EHPad and, if necessary, its descendants.
+ Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
+ assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
+ if (UnwindDestToken)
+ return UnwindDestToken;
+
+ // No information is available for this EHPad from itself or any of its
+ // descendants. An unwind all the way out to a pad in the caller would
+ // need also to agree with the unwind dest of the parent funclet, so
+ // search up the chain to try to find a funclet with information. Put
+ // null entries in the memo map to avoid re-processing as we go up.
+ MemoMap[EHPad] = nullptr;
+#ifndef NDEBUG
+ SmallPtrSet<Instruction *, 4> TempMemos;
+ TempMemos.insert(EHPad);
+#endif
+ Instruction *LastUselessPad = EHPad;
+ Value *AncestorToken;
+ for (AncestorToken = getParentPad(EHPad);
+ auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
+ AncestorToken = getParentPad(AncestorToken)) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(AncestorPad))
+ continue;
+ // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
+ // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
+ // call to getUnwindDestToken, that would mean that AncestorPad had no
+ // information in itself, its descendants, or its ancestors. If that
+ // were the case, then we should also have recorded the lack of information
+ // for the descendant that we're coming from. So assert that we don't
+ // find a null entry in the MemoMap for AncestorPad.
+ assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
+ auto AncestorMemo = MemoMap.find(AncestorPad);
+ if (AncestorMemo == MemoMap.end()) {
+ UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
+ } else {
+ UnwindDestToken = AncestorMemo->second;
+ }
+ if (UnwindDestToken)
+ break;
+ LastUselessPad = AncestorPad;
+ MemoMap[LastUselessPad] = nullptr;
+#ifndef NDEBUG
+ TempMemos.insert(LastUselessPad);
+#endif
+ }
+
+ // We know that getUnwindDestTokenHelper was called on LastUselessPad and
+ // returned nullptr (and likewise for EHPad and any of its ancestors up to
+ // LastUselessPad), so LastUselessPad has no information from below. Since
+ // getUnwindDestTokenHelper must investigate all downward paths through
+ // no-information nodes to prove that a node has no information like this,
+ // and since any time it finds information it records it in the MemoMap for
+ // not just the immediately-containing funclet but also any ancestors also
+ // exited, it must be the case that, walking downward from LastUselessPad,
+ // visiting just those nodes which have not been mapped to an unwind dest
+ // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
+ // they are just used to keep getUnwindDestTokenHelper from repeating work),
+ // any node visited must have been exhaustively searched with no information
+ // for it found.
+ SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
+ while (!Worklist.empty()) {
+ Instruction *UselessPad = Worklist.pop_back_val();
+ auto Memo = MemoMap.find(UselessPad);
+ if (Memo != MemoMap.end() && Memo->second) {
+ // Here the name 'UselessPad' is a bit of a misnomer, because we've found
+ // that it is a funclet that does have information about unwinding to
+ // a particular destination; its parent was a useless pad.
+ // Since its parent has no information, the unwind edge must not escape
+ // the parent, and must target a sibling of this pad. This local unwind
+ // gives us no information about EHPad. Leave it and the subtree rooted
+ // at it alone.
+ assert(getParentPad(Memo->second) == getParentPad(UselessPad));
+ continue;
+ }
+ // We know we don't have information for UselessPad. If it has an entry in
+ // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
+ // added on this invocation of getUnwindDestToken; if a previous invocation
+ // recorded nullptr, it would have had to prove that the ancestors of
+ // UselessPad, which include LastUselessPad, had no information, and that
+ // in turn would have required proving that the descendants of
+ // LastUselessPad, which include EHPad, have no information about
+ // LastUselessPad, which would imply that EHPad was mapped to nullptr in
+ // the MemoMap on that invocation, which isn't the case if we got here.
+ assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
+ // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
+ // information that we'd be contradicting by making a map entry for it
+ // (which is something that getUnwindDestTokenHelper must have proved for
+ // us to get here). Just assert on its direct users here; the checks in
+ // this downward walk at its descendants will verify that they don't have
+ // any unwind edges that exit 'UselessPad' either (i.e. they either have no
+ // unwind edges or unwind to a sibling).
+ MemoMap[UselessPad] = UnwindDestToken;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
+ assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
+ for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
+ auto *CatchPad = HandlerBlock->getFirstNonPHI();
+ for (User *U : CatchPad->users()) {
+ assert(
+ (!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ CatchPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ } else {
+ assert(isa<CleanupPadInst>(UselessPad));
+ for (User *U : UselessPad->users()) {
+ assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
+ assert((!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ UselessPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ }
+
+ return UnwindDestToken;
+}
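
To make the contract above concrete, a small hypothetical illustration of how a caller interprets the three possible results (mirroring the checks performed later in this file):

// Illustration only: classify the unwind destination of one funclet pad.
static void classifyUnwindDest(Instruction *FuncletPad,
                               UnwindDestMemoTy &FuncletUnwindMap) {
  Value *Tok = getUnwindDestToken(FuncletPad, FuncletUnwindMap);
  if (!Tok) {
    // No definitive unwind destination could be proven for this funclet.
  } else if (isa<ConstantTokenNone>(Tok)) {
    // The funclet definitively unwinds to the caller.
  } else {
    // Tok is the first non-PHI instruction of the EH pad it unwinds to.
  }
}
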
+
+/// When we inline a basic block into an invoke,
+/// we have to turn all of the calls that can throw into invokes.
+/// This function analyzes BB to see if there are any calls, and if so,
+/// it rewrites them to be invokes that unwind to UnwindEdge; the caller is
+/// then responsible for updating the PHI nodes in that unwind destination.
+static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
+ BasicBlock *BB, BasicBlock *UnwindEdge,
+ UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *I = &*BBI++;
+
+ // We only need to check for function calls: inlined invoke
+ // instructions require no special handling.
+ CallInst *CI = dyn_cast<CallInst>(I);
+
+ if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
+ continue;
+
+ // We do not need to (and in fact, cannot) convert possibly throwing calls
+ // to @llvm.experimental.deoptimize (resp. @llvm.experimental.guard) into
+ // invokes. The caller's "segment" of the deoptimization continuation
+ // attached to the newly inlined @llvm.experimental.deoptimize
+ // (resp. @llvm.experimental.guard) call should contain the exception
+ // handling logic, if any.
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
+ F->getIntrinsicID() == Intrinsic::experimental_guard)
+ continue;
+
+ if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+ // This call is nested inside a funclet. If that funclet has an unwind
+ // destination within the inlinee, then unwinding out of this call would
+ // be UB. Rewriting this call to an invoke which targets the inlined
+ // invoke's unwind dest would give the call's parent funclet multiple
+ // unwind destinations, which is something that subsequent EH table
+ // generation can't handle and that the verifier rejects. So when we
+ // see such a call, leave it as a call.
+ auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
+ Value *UnwindDestToken =
+ getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+#ifndef NDEBUG
+ Instruction *MemoKey;
+ if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+ MemoKey = CatchPad->getCatchSwitch();
+ else
+ MemoKey = FuncletPad;
+ assert(FuncletUnwindMap->count(MemoKey) &&
+ (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
+ "must get memoized to avoid confusing later searches");
+#endif // NDEBUG
+ }
+
+ changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
+ return BB;
+ }
+ return nullptr;
+}
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *InvokeDest = II->getUnwindDest();
+
+ Function *Caller = FirstNewBlock->getParent();
+
+ // The inlined code is currently at the end of the function, scan from the
+ // start of the inlined code to its end, checking for stuff we need to
+ // rewrite.
+ LandingPadInliningInfo Invoke(II);
+
+ // Get all of the inlined landing pad instructions.
+ SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+ for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end();
+ I != E; ++I)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+ InlinedLPads.insert(II->getLandingPadInst());
+
+ // Append the clauses from the outer landing pad instruction into the inlined
+ // landing pad instructions.
+ LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
+ for (LandingPadInst *InlinedLPad : InlinedLPads) {
+ unsigned OuterNum = OuterLPad->getNumClauses();
+ InlinedLPad->reserveClauses(OuterNum);
+ for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
+ InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+ if (OuterLPad->isCleanup())
+ InlinedLPad->setCleanup(true);
+ }
+
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (InlinedCodeInfo.ContainsCalls)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, Invoke.getOuterResumeDest()))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ Invoke.addIncomingPHIValuesFor(NewBB);
+
+ // Forward any resumes that are remaining here.
+ if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
+ Invoke.forwardResume(RI, InlinedLPads);
+ }
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ InvokeDest->removePredecessor(II->getParent());
+}
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Function *Caller = FirstNewBlock->getParent();
+
+ assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!");
+
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing the
+ // edge from this block.
+ SmallVector<Value *, 8> UnwindDestPHIValues;
+ llvm::BasicBlock *InvokeBB = II->getParent();
+ for (Instruction &I : *UnwindDest) {
+ // Save the value to use for this edge.
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ // Add incoming-PHI values to the unwind destination block for the given basic
+ // block, using the values for the original invoke's source block.
+ auto UpdatePHINodes = [&](BasicBlock *Src) {
+ BasicBlock::iterator I = UnwindDest->begin();
+ for (Value *V : UnwindDestPHIValues) {
+ PHINode *PHI = cast<PHINode>(I);
+ PHI->addIncoming(V, Src);
+ ++I;
+ }
+ };
+
+ // This connects all the instructions which 'unwind to caller' to the invoke
+ // destination.
+ UnwindDestMemoTy FuncletUnwindMap;
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
+ if (CRI->unwindsToCaller()) {
+ auto *CleanupPad = CRI->getCleanupPad();
+ CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
+ CRI->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ // Finding a cleanupret with an unwind destination would confuse
+ // subsequent calls to getUnwindDestToken, so map the cleanuppad
+ // to short-circuit any such calls and recognize this as an "unwind
+ // to caller" cleanup.
+ assert(!FuncletUnwindMap.count(CleanupPad) ||
+ isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
+ FuncletUnwindMap[CleanupPad] =
+ ConstantTokenNone::get(Caller->getContext());
+ }
+ }
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ Instruction *Replacement = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (CatchSwitch->unwindsToCaller()) {
+ Value *UnwindDestToken;
+ if (auto *ParentPad =
+ dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
+ // This catchswitch is nested inside another funclet. If that
+ // funclet has an unwind destination within the inlinee, then
+ // unwinding out of this catchswitch would be UB. Rewriting this
+ // catchswitch to unwind to the inlined invoke's unwind dest would
+ // give the parent funclet multiple unwind destinations, which is
+ // something that subsequent EH table generation can't handle and
+ // that the verifier rejects. So when we see such a catchswitch, leave it
+ // as "unwind to caller".
+ UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+ } else {
+ // This catchswitch has no parent to inherit constraints from, and
+ // none of its descendants can have an unwind edge that exits it and
+ // targets another funclet in the inlinee. It may or may not have a
+ // descendant that definitively has an unwind to caller. In either
+ // case, we'll have to assume that any unwinds out of it may need to
+ // be routed to the caller, so treat it as though it has a definitive
+ // unwind to caller.
+ UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
+ }
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), UnwindDest,
+ CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
+ CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+ // Propagate info for the old catchswitch over to the new one in
+ // the unwind map. This also serves to short-circuit any subsequent
+ // checks for the unwind dest of this catchswitch, which would get
+ // confused if they found the outer handler in the callee.
+ FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
+ Replacement = NewCatchSwitch;
+ }
+ } else if (!isa<FuncletPadInst>(I)) {
+ llvm_unreachable("unexpected EHPad!");
+ }
+
+ if (Replacement) {
+ Replacement->takeName(I);
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ }
+ }
+
+ if (InlinedCodeInfo.ContainsCalls)
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, UnwindDest, &FuncletUnwindMap))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ UpdatePHINodes(NewBB);
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ UnwindDest->removePredecessor(InvokeBB);
+}
+
+/// When inlining a call site that has !llvm.mem.parallel_loop_access metadata,
+/// that metadata should be propagated to all memory-accessing cloned
+/// instructions.
+static void PropagateParallelLoopAccessMetadata(CallSite CS,
+ ValueToValueMapTy &VMap) {
+ MDNode *M =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (!M)
+ return;
+
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ if (MDNode *PM = NI->getMetadata(LLVMContext::MD_mem_parallel_loop_access)) {
+ M = MDNode::concatenate(PM, M);
+ NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M);
+ } else if (NI->mayReadOrWriteMemory()) {
+ NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M);
+ }
+ }
+}
+
+/// When inlining a function that contains noalias scope metadata,
+/// this metadata needs to be cloned so that the inlined blocks
+/// have different "unique scopes" at every call site. Were this not done, then
+/// aliasing scopes from a function inlined into a caller multiple times could
+/// not be differentiated (and this would lead to miscompiles because the
+/// non-aliasing property communicated by the metadata could have
+/// call-site-specific control dependencies).
+static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
+ const Function *CalledFunc = CS.getCalledFunction();
+ SetVector<const MDNode *> MD;
+
+ // Note: We could only clone the metadata if it is already used in the
+ // caller. I'm omitting that check here because it might confuse
+ // inter-procedural alias analysis passes. We can revisit this if it becomes
+ // an efficiency or overhead problem.
+
+ for (const BasicBlock &I : *CalledFunc)
+ for (const Instruction &J : I) {
+ if (const MDNode *M = J.getMetadata(LLVMContext::MD_alias_scope))
+ MD.insert(M);
+ if (const MDNode *M = J.getMetadata(LLVMContext::MD_noalias))
+ MD.insert(M);
+ }
+
+ if (MD.empty())
+ return;
+
+ // Walk the existing metadata, adding the complete (perhaps cyclic) chain to
+ // the set.
+ SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
+ while (!Queue.empty()) {
+ const MDNode *M = cast<MDNode>(Queue.pop_back_val());
+ for (unsigned i = 0, ie = M->getNumOperands(); i != ie; ++i)
+ if (const MDNode *M1 = dyn_cast<MDNode>(M->getOperand(i)))
+ if (MD.insert(M1))
+ Queue.push_back(M1);
+ }
+
+ // Now we have a complete set of all metadata in the chains used to specify
+ // the noalias scopes and the lists of those scopes.
+ SmallVector<TempMDTuple, 16> DummyNodes;
+ DenseMap<const MDNode *, TrackingMDNodeRef> MDMap;
+ for (const MDNode *I : MD) {
+ DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None));
+ MDMap[I].reset(DummyNodes.back().get());
+ }
+
+ // Create new metadata nodes to replace the dummy nodes, replacing old
+ // metadata references with either a dummy node or an already-created new
+ // node.
+ for (const MDNode *I : MD) {
+ SmallVector<Metadata *, 4> NewOps;
+ for (unsigned i = 0, ie = I->getNumOperands(); i != ie; ++i) {
+ const Metadata *V = I->getOperand(i);
+ if (const MDNode *M = dyn_cast<MDNode>(V))
+ NewOps.push_back(MDMap[M]);
+ else
+ NewOps.push_back(const_cast<Metadata *>(V));
+ }
+
+ MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps);
+ MDTuple *TempM = cast<MDTuple>(MDMap[I]);
+ assert(TempM->isTemporary() && "Expected temporary node");
+
+ TempM->replaceAllUsesWith(NewM);
+ }
+
+ // Now replace the metadata in the new inlined instructions with the
+ // replacements from the map.
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ if (MDNode *M = NI->getMetadata(LLVMContext::MD_alias_scope)) {
+ MDNode *NewMD = MDMap[M];
+ // If the call site also had alias scope metadata (a list of scopes to
+ // which instructions inside it might belong), propagate those scopes to
+ // the inlined instructions.
+ if (MDNode *CSM =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+ NewMD = MDNode::concatenate(NewMD, CSM);
+ NI->setMetadata(LLVMContext::MD_alias_scope, NewMD);
+ } else if (NI->mayReadOrWriteMemory()) {
+ if (MDNode *M =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+ NI->setMetadata(LLVMContext::MD_alias_scope, M);
+ }
+
+ if (MDNode *M = NI->getMetadata(LLVMContext::MD_noalias)) {
+ MDNode *NewMD = MDMap[M];
+ // If the call site also had noalias metadata (a list of scopes with
+ // which instructions inside it don't alias), propagate those scopes to
+ // the inlined instructions.
+ if (MDNode *CSM =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+ NewMD = MDNode::concatenate(NewMD, CSM);
+ NI->setMetadata(LLVMContext::MD_noalias, NewMD);
+ } else if (NI->mayReadOrWriteMemory()) {
+ if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+ NI->setMetadata(LLVMContext::MD_noalias, M);
+ }
+ }
+}
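
The temporary-node handling above exists because these scope chains can be cyclic (for instance, an anonymous alias-scope domain is rooted in a self-referential node). A minimal, hypothetical illustration of that idiom using core metadata APIs:

// Sketch: build a self-referential MDNode with the placeholder-then-RAUW
// idiom. The node must contain itself, so a temporary stands in for it
// until the real node exists.
static MDNode *makeSelfReferentialNode(LLVMContext &Ctx, StringRef Name) {
  TempMDTuple Temp = MDTuple::getTemporary(Ctx, None);
  Metadata *Ops[] = {Temp.get(), MDString::get(Ctx, Name)};
  MDNode *Node = MDNode::get(Ctx, Ops);
  Temp->replaceAllUsesWith(Node); // Node's first operand is now Node itself.
  return Node;
}
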
+
+/// If the inlined function has noalias arguments,
+/// then add new alias scopes for each noalias argument, tag the mapped noalias
+/// parameters with noalias metadata specifying the new scope, and tag all
+/// non-derived loads, stores and memory intrinsics with the new alias scopes.
+static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR) {
+ if (!EnableNoAliasConversion)
+ return;
+
+ const Function *CalledFunc = CS.getCalledFunction();
+ SmallVector<const Argument *, 4> NoAliasArgs;
+
+ for (const Argument &Arg : CalledFunc->args())
+ if (Arg.hasNoAliasAttr() && !Arg.use_empty())
+ NoAliasArgs.push_back(&Arg);
+
+ if (NoAliasArgs.empty())
+ return;
+
+ // To do a good job, if a noalias variable is captured, we need to know if
+ // the capture point dominates the particular use we're considering.
+ DominatorTree DT;
+ DT.recalculate(const_cast<Function&>(*CalledFunc));
+
+ // noalias indicates that pointer values based on the argument do not alias
+ // pointer values which are not based on it. So we add a new "scope" for each
+ // noalias function argument. Accesses using pointers based on that argument
+ // become part of that alias scope, accesses using pointers not based on that
+ // argument are tagged as noalias with that scope.
+
+ DenseMap<const Argument *, MDNode *> NewScopes;
+ MDBuilder MDB(CalledFunc->getContext());
+
+ // Create a new scope domain for this function.
+ MDNode *NewDomain =
+ MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
+ for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
+ const Argument *A = NoAliasArgs[i];
+
+ std::string Name = CalledFunc->getName();
+ if (A->hasName()) {
+ Name += ": %";
+ Name += A->getName();
+ } else {
+ Name += ": argument ";
+ Name += utostr(i);
+ }
+
+ // Note: We always create a new anonymous root here, regardless of the
+ // linkage of the callee, because the aliasing "scope" is not just a property
+ // of the callee but also of all control dependencies in the caller.
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ NewScopes.insert(std::make_pair(A, NewScope));
+ }
+
+ // Iterate over all new instructions in the map; for all memory-access
+ // instructions, add the alias scope metadata.
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ bool IsArgMemOnlyCall = false, IsFuncCall = false;
+ SmallVector<const Value *, 2> PtrArgs;
+
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ PtrArgs.push_back(LI->getPointerOperand());
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ PtrArgs.push_back(SI->getPointerOperand());
+ else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+ PtrArgs.push_back(VAAI->getPointerOperand());
+ else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+ PtrArgs.push_back(CXI->getPointerOperand());
+ else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
+ PtrArgs.push_back(RMWI->getPointerOperand());
+ else if (ImmutableCallSite ICS = ImmutableCallSite(I)) {
+ // If we know that the call does not access memory, then we'll still
+ // know that about the inlined clone of this call site, and we don't
+ // need to add metadata.
+ if (ICS.doesNotAccessMemory())
+ continue;
+
+ IsFuncCall = true;
+ if (CalleeAAR) {
+ FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS);
+ if (MRB == FMRB_OnlyAccessesArgumentPointees ||
+ MRB == FMRB_OnlyReadsArgumentPointees)
+ IsArgMemOnlyCall = true;
+ }
+
+ for (Value *Arg : ICS.args()) {
+ // We need to check the underlying objects of all arguments, not just
+ // the pointer arguments, because we might be passing pointers as
+ // integers, etc.
+ // However, if we know that the call only accesses pointer arguments,
+ // then we only need to check the pointer arguments.
+ if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
+ continue;
+
+ PtrArgs.push_back(Arg);
+ }
+ }
+
+ // If we found no pointers, then this instruction is not suitable for
+ // pairing with an instruction to receive aliasing metadata.
+ // However, if this is a call, we might simply not alias any of the
+ // noalias arguments.
+ if (PtrArgs.empty() && !IsFuncCall)
+ continue;
+
+ // It is possible that there is only one underlying object, but that it
+ // is reachable through several PHIs, and could thus appear multiple times
+ // in the Objects list.
+ SmallPtrSet<const Value *, 4> ObjSet;
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+
+ SmallSetVector<const Argument *, 4> NAPtrArgs;
+ for (const Value *V : PtrArgs) {
+ SmallVector<Value *, 4> Objects;
+ GetUnderlyingObjects(const_cast<Value*>(V),
+ Objects, DL, /* LI = */ nullptr);
+
+ for (Value *O : Objects)
+ ObjSet.insert(O);
+ }
+
+ // Figure out if we're derived from anything that is not a noalias
+ // argument.
+ bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
+ for (const Value *V : ObjSet) {
+ // Is this value a constant that cannot be derived from any pointer
+ // value? (We need to exclude constant expressions, for example, that
+ // are formed from arithmetic on global symbols.)
+ bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
+ isa<ConstantPointerNull>(V) ||
+ isa<ConstantDataVector>(V) || isa<UndefValue>(V);
+ if (IsNonPtrConst)
+ continue;
+
+ // If this is anything other than a noalias argument, then we cannot
+ // completely describe the aliasing properties using alias.scope
+ // metadata (and, thus, won't add any).
+ if (const Argument *A = dyn_cast<Argument>(V)) {
+ if (!A->hasNoAliasAttr())
+ UsesAliasingPtr = true;
+ } else {
+ UsesAliasingPtr = true;
+ }
+
+ // If this is not some identified function-local object (which cannot
+ // directly alias a noalias argument), or some other argument (which,
+ // by definition, also cannot alias a noalias argument), then we could
+ // alias a noalias argument that has been captured.
+ if (!isa<Argument>(V) &&
+ !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
+ CanDeriveViaCapture = true;
+ }
+
+ // A function call can always get captured noalias pointers (via other
+ // parameters, globals, etc.).
+ if (IsFuncCall && !IsArgMemOnlyCall)
+ CanDeriveViaCapture = true;
+
+ // First, we want to figure out all of the sets with which we definitely
+ // don't alias. Iterate over all noalias sets, and add those for which:
+ // 1. The noalias argument is not in the set of objects from which we
+ // definitely derive.
+ // 2. The noalias argument has not yet been captured.
+ // An arbitrary function that might load pointers could see captured
+ // noalias arguments via other noalias arguments or globals, and so we
+ // must always check for prior capture.
+ for (const Argument *A : NoAliasArgs) {
+ if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
+ // It might be tempting to skip the
+ // PointerMayBeCapturedBefore check if
+ // A->hasNoCaptureAttr() is true, but this is
+ // incorrect because nocapture only guarantees
+ // that no copies outlive the function, not
+ // that the value cannot be locally captured.
+ !PointerMayBeCapturedBefore(A,
+ /* ReturnCaptures */ false,
+ /* StoreCaptures */ false, I, &DT)))
+ NoAliases.push_back(NewScopes[A]);
+ }
+
+ if (!NoAliases.empty())
+ NI->setMetadata(LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ NI->getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(CalledFunc->getContext(), NoAliases)));
+
+ // Next, we want to figure out all of the sets to which we might belong.
+ // We might belong to a set if the noalias argument is in the set of
+ // underlying objects. If there is some non-noalias argument in our list
+ // of underlying objects, then we cannot add a scope because the fact
+ // that some access does not alias with any set of our noalias arguments
+ // cannot itself guarantee that it does not alias with this access
+ // (because there is some pointer of unknown origin involved and the
+ // other access might also depend on this pointer). We also cannot add
+ // scopes to arbitrary functions unless we know they don't access any
+ // non-parameter pointer-values.
+ bool CanAddScopes = !UsesAliasingPtr;
+ if (CanAddScopes && IsFuncCall)
+ CanAddScopes = IsArgMemOnlyCall;
+
+ if (CanAddScopes)
+ for (const Argument *A : NoAliasArgs) {
+ if (ObjSet.count(A))
+ Scopes.push_back(NewScopes[A]);
+ }
+
+ if (!Scopes.empty())
+ NI->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(CalledFunc->getContext(), Scopes)));
+ }
+ }
+}
+
+/// If the inlined function has non-byval align arguments, then
+/// add @llvm.assume-based alignment assumptions to preserve this information.
+static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
+ if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
+ return;
+
+ AssumptionCache *AC = &(*IFI.GetAssumptionCache)(*CS.getCaller());
+ auto &DL = CS.getCaller()->getParent()->getDataLayout();
+
+ // To avoid inserting redundant assumptions, we should check for assumptions
+ // already in the caller. To do this, we might need a DT of the caller.
+ DominatorTree DT;
+ bool DTCalculated = false;
+
+ Function *CalledFunc = CS.getCalledFunction();
+ for (Argument &Arg : CalledFunc->args()) {
+ unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+ if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
+ if (!DTCalculated) {
+ DT.recalculate(*CS.getCaller());
+ DTCalculated = true;
+ }
+
+ // If we can already prove the asserted alignment in the context of the
+ // caller, then don't bother inserting the assumption.
+ Value *ArgVal = CS.getArgument(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align)
+ continue;
+
+ CallInst *NewAsmp = IRBuilder<>(CS.getInstruction())
+ .CreateAlignmentAssumption(DL, ArgVal, Align);
+ AC->registerAssumption(NewAsmp);
+ }
+ }
+}
+
+/// Once we have cloned code over from a callee into the caller,
+/// update the specified callgraph to reflect the changes we made.
+/// Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallSite CS,
+ Function::iterator FirstNewBlock,
+ ValueToValueMapTy &VMap,
+ InlineFunctionInfo &IFI) {
+ CallGraph &CG = *IFI.CG;
+ const Function *Caller = CS.getCaller();
+ const Function *Callee = CS.getCalledFunction();
+ CallGraphNode *CalleeNode = CG[Callee];
+ CallGraphNode *CallerNode = CG[Caller];
+
+ // Since we inlined some uninlined call sites in the callee into the caller,
+ // add edges from the caller to all of the callees of the callee.
+ CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+ // Consider the case where CalleeNode == CallerNode.
+ CallGraphNode::CalledFunctionsVector CallCache;
+ if (CalleeNode == CallerNode) {
+ CallCache.assign(I, E);
+ I = CallCache.begin();
+ E = CallCache.end();
+ }
+
+ for (; I != E; ++I) {
+ const Value *OrigCall = I->first;
+
+ ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
+ // Only copy the edge if the call was inlined!
+ if (VMI == VMap.end() || VMI->second == nullptr)
+ continue;
+
+ // If the call was inlined, but then constant folded, there is no edge to
+ // add. Check for this case.
+ Instruction *NewCall = dyn_cast<Instruction>(VMI->second);
+ if (!NewCall)
+ continue;
+
+ // We do not treat intrinsic calls like real function calls because we
+ // expect them to become inline code; do not add an edge for an intrinsic.
+ CallSite CS = CallSite(NewCall);
+ if (CS && CS.getCalledFunction() && CS.getCalledFunction()->isIntrinsic())
+ continue;
+
+ // Remember that this call site got inlined for the client of
+ // InlineFunction.
+ IFI.InlinedCalls.push_back(NewCall);
+
+ // It's possible that inlining the callsite will cause it to go from an
+ // indirect to a direct call by resolving a function pointer. If this
+ // happens, set the callee of the new call site to a more precise
+ // destination. This can also happen if the call graph node of the caller
+ // was just unnecessarily imprecise.
+ if (!I->second->getFunction())
+ if (Function *F = CallSite(NewCall).getCalledFunction()) {
+ // Indirect call site resolved to direct call.
+ CallerNode->addCalledFunction(CallSite(NewCall), CG[F]);
+
+ continue;
+ }
+
+ CallerNode->addCalledFunction(CallSite(NewCall), I->second);
+ }
+
+ // Update the call graph by deleting the edge from Callee to Caller. We must
+ // do this after the loop above in case Caller and Callee are the same.
+ CallerNode->removeCallEdgeFor(CS);
+}
+
+static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
+ BasicBlock *InsertBlock,
+ InlineFunctionInfo &IFI) {
+ Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
+ IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
+
+ Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
+
+ // Always generate a memcpy of alignment 1 here because we don't know
+ // the alignment of the src pointer. Other optimizations can infer
+ // better alignment.
+ Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
+}
+
+/// When inlining a call site that has a byval argument,
+/// we have to make the implicit memcpy explicit by adding it.
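+///
+/// Illustrative example (not part of this patch): for a non-readonly callee
+/// taking "%struct.S* byval %p", inlining creates a fresh alloca in the
+/// caller's entry block, emits a memcpy from the actual argument into it, and
+/// the inlined body then uses the alloca in place of the original pointer.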
+static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+ const Function *CalledFunc,
+ InlineFunctionInfo &IFI,
+ unsigned ByValAlignment) {
+ PointerType *ArgTy = cast<PointerType>(Arg->getType());
+ Type *AggTy = ArgTy->getElementType();
+
+ Function *Caller = TheCall->getFunction();
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // If the called function is readonly, then it could not mutate the caller's
+ // copy of the byval'd memory. In this case, it is safe to elide the copy and
+ // temporary.
+ if (CalledFunc->onlyReadsMemory()) {
+    // If the byval argument has a specified alignment that is greater than
+    // the alignment of the passed-in pointer, then we either have to round up
+    // the input pointer or give up on this transformation.
+ if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
+ return Arg;
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
+
+ // If the pointer is already known to be sufficiently aligned, or if we can
+ // round it up to a larger alignment, then we don't need a temporary.
+ if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
+ ByValAlignment)
+ return Arg;
+
+ // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
+ // for code quality, but rarely happens and is required for correctness.
+ }
+
+ // Create the alloca. If we have DataLayout, use nice alignment.
+ unsigned Align = DL.getPrefTypeAlignment(AggTy);
+
+ // If the byval had an alignment specified, we *must* use at least that
+ // alignment, as it is required by the byval argument (and uses of the
+ // pointer inside the callee).
+ Align = std::max(Align, ByValAlignment);
+
+ Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
+ nullptr, Align, Arg->getName(),
+ &*Caller->begin()->begin());
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
+
+ // Uses of the argument in the function should use our new alloca
+ // instead.
+ return NewAlloca;
+}
+
+// Check whether this Value is used by a lifetime intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+ for (User *U : V->users()) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
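+// Illustrative example (not part of this patch): markers are typically found
+// on an i8* bitcast of the alloca, e.g.
+//   %p = bitcast [16 x i8]* %buf to i8*
+//   call void @llvm.lifetime.start(i64 16, i8* %p)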
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+ Type *Ty = AI->getType();
+ Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(),
+ Ty->getPointerAddressSpace());
+ if (Ty == Int8PtrTy)
+ return isUsedByLifetimeMarker(AI);
+
+ // Do a scan to find all the casts to i8*.
+ for (User *U : AI->users()) {
+ if (U->getType() != Int8PtrTy) continue;
+ if (U->stripPointerCasts() != AI) continue;
+ if (isUsedByLifetimeMarker(U))
+ return true;
+ }
+ return false;
+}
+
+/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
+/// block. Allocas used in inalloca calls and allocas of dynamic array size
+/// cannot be static.
+static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) {
+ return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca();
+}
+
+/// Update inlined instructions' line numbers to encode the location where
+/// these instructions are inlined.
+static void fixupLineNumbers(Function *Fn, Function::iterator FI,
+ Instruction *TheCall, bool CalleeHasDebugInfo) {
+ const DebugLoc &TheCallDL = TheCall->getDebugLoc();
+ if (!TheCallDL)
+ return;
+
+ auto &Ctx = Fn->getContext();
+ DILocation *InlinedAtNode = TheCallDL;
+
+ // Create a unique call site, not to be confused with any other call from the
+ // same location.
+ InlinedAtNode = DILocation::getDistinct(
+ Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
+ InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
+
+  // Cache the inlined-at nodes as they're built so they are reused; without
+  // this, every instruction's inlined-at chain would become distinct from the
+  // others.
+ DenseMap<const MDNode *, MDNode *> IANodes;
+
+ for (; FI != Fn->end(); ++FI) {
+ for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
+ BI != BE; ++BI) {
+ if (DebugLoc DL = BI->getDebugLoc()) {
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
+ continue;
+ }
+
+ if (CalleeHasDebugInfo)
+ continue;
+
+ // If the inlined instruction has no line number, make it look as if it
+ // originates from the call location. This is important for
+ // ((__always_inline__, __nodebug__)) functions which must use caller
+ // location for all instructions in their function body.
+
+ // Don't update static allocas, as they may get moved later.
+ if (auto *AI = dyn_cast<AllocaInst>(BI))
+ if (allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ BI->setDebugLoc(TheCallDL);
+ }
+ }
+}
+
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// callee's entry block gets the same frequency as the callsite block and the
+/// relative frequencies of all cloned blocks remain the same after cloning.
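+///
+/// Worked example with illustrative numbers (not from this patch): if the
+/// callsite block has frequency 200 and the callee entry block has frequency
+/// 400, every cloned block gets half of its original callee frequency.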
+static void updateCallerBFI(BasicBlock *CallSiteBlock,
+ const ValueToValueMapTy &VMap,
+ BlockFrequencyInfo *CallerBFI,
+ BlockFrequencyInfo *CalleeBFI,
+ const BasicBlock &CalleeEntryBlock) {
+ SmallPtrSet<BasicBlock *, 16> ClonedBBs;
+ for (auto const &Entry : VMap) {
+ if (!isa<BasicBlock>(Entry.first) || !Entry.second)
+ continue;
+ auto *OrigBB = cast<BasicBlock>(Entry.first);
+ auto *ClonedBB = cast<BasicBlock>(Entry.second);
+ uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+ if (!ClonedBBs.insert(ClonedBB).second) {
+ // Multiple blocks in the callee might get mapped to one cloned block in
+ // the caller since we prune the callee as we clone it. When that happens,
+ // we want to use the maximum among the original blocks' frequencies.
+ uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+ if (NewFreq > Freq)
+ Freq = NewFreq;
+ }
+ CallerBFI->setBlockFreq(ClonedBB, Freq);
+ }
+ BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
+ CallerBFI->setBlockFreqAndScale(
+ EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
+ ClonedBBs);
+}
+
+/// Update the branch metadata for cloned call instructions.
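+///
+/// Illustrative numbers (not from this patch): with a callee entry count of
+/// 1000 and an estimated callsite count of 400, the profile weights of the
+/// cloned calls are scaled by 400/1000 and those of the calls remaining in
+/// the original callee by 600/1000.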
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+ const Optional<uint64_t> &CalleeEntryCount,
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *CallerBFI) {
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+ return;
+ Optional<uint64_t> CallSiteCount =
+ PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ uint64_t CallCount =
+ std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
+ CalleeEntryCount.getValue());
+
+ for (auto const &Entry : VMap)
+ if (isa<CallInst>(Entry.first))
+ if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+ CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+ for (BasicBlock &BB : *Callee)
+ // No need to update the callsite if it is pruned during inlining.
+ if (VMap.count(&BB))
+ for (Instruction &I : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
+ CalleeEntryCount.getValue());
+}
+
+/// Update the entry count of callee after inlining.
+///
+/// The callsite's block count is subtracted from the callee's function entry
+/// count.
+static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
+  // If the callee has an original count of N, and the estimated count of
+ // callsite is M, the new callee count is set to N - M. M is estimated from
+ // the caller's entry count, its entry block frequency and the block frequency
+ // of the callsite.
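+  //
+  // Illustrative numbers (not from this patch): an original callee entry
+  // count of 1000 and an estimated callsite count of 300 yield a new entry
+  // count of 700; an estimate of 1200 would be clamped to 0.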
+ Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+ if (!CalleeCount.hasValue() || !PSI)
+ return;
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
+ if (!CallCount.hasValue())
+ return;
+  // Since the callsite count is an estimate, it could exceed the original
+  // callee count; in that case the new callee count is clamped to 0.
+ if (CallCount.getValue() > CalleeCount.getValue())
+ Callee->setEntryCount(0);
+ else
+ Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
+}
+
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
+///
+/// Note that this only does one level of inlining. For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream. Similarly this will inline a recursive
+/// function by one level.
+bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
+ AAResults *CalleeAAR, bool InsertLifetime) {
+ Instruction *TheCall = CS.getInstruction();
+ assert(TheCall->getParent() && TheCall->getFunction()
+ && "Instruction not in function!");
+
+ // If IFI has any state in it, zap it before we fill it in.
+ IFI.reset();
+
+ Function *CalledFunc = CS.getCalledFunction();
+ if (!CalledFunc || // Can't inline external function or indirect
+ CalledFunc->isDeclaration() || // call, or call to a vararg function!
+ CalledFunc->getFunctionType()->isVarArg()) return false;
+
+ // The inliner does not know how to inline through calls with operand bundles
+ // in general ...
+ if (CS.hasOperandBundles()) {
+ for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
+ uint32_t Tag = CS.getOperandBundleAt(i).getTagID();
+ // ... but it knows how to inline through "deopt" operand bundles ...
+ if (Tag == LLVMContext::OB_deopt)
+ continue;
+ // ... and "funclet" operand bundles.
+ if (Tag == LLVMContext::OB_funclet)
+ continue;
+
+ return false;
+ }
+ }
+
+ // If the call to the callee cannot throw, set the 'nounwind' flag on any
+ // calls that we inline.
+ bool MarkNoUnwind = CS.doesNotThrow();
+
+ BasicBlock *OrigBB = TheCall->getParent();
+ Function *Caller = OrigBB->getParent();
+
+ // GC poses two hazards to inlining, which only occur when the callee has GC:
+ // 1. If the caller has no GC, then the callee's GC must be propagated to the
+ // caller.
+ // 2. If the caller has a differing GC, it is invalid to inline.
+ if (CalledFunc->hasGC()) {
+ if (!Caller->hasGC())
+ Caller->setGC(CalledFunc->getGC());
+ else if (CalledFunc->getGC() != Caller->getGC())
+ return false;
+ }
+
+ // Get the personality function from the callee if it contains a landing pad.
+ Constant *CalledPersonality =
+ CalledFunc->hasPersonalityFn()
+ ? CalledFunc->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+
+ // Find the personality function used by the landing pads of the caller. If it
+ // exists, then check to see that it matches the personality function used in
+ // the callee.
+ Constant *CallerPersonality =
+ Caller->hasPersonalityFn()
+ ? Caller->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+ if (CalledPersonality) {
+ if (!CallerPersonality)
+ Caller->setPersonalityFn(CalledPersonality);
+ // If the personality functions match, then we can perform the
+ // inlining. Otherwise, we can't inline.
+ // TODO: This isn't 100% true. Some personality functions are proper
+ // supersets of others and can be used in place of the other.
+ else if (CalledPersonality != CallerPersonality)
+ return false;
+ }
+
+ // We need to figure out which funclet the callsite was in so that we may
+ // properly nest the callee.
+ Instruction *CallSiteEHPad = nullptr;
+ if (CallerPersonality) {
+ EHPersonality Personality = classifyEHPersonality(CallerPersonality);
+ if (isFuncletEHPersonality(Personality)) {
+ Optional<OperandBundleUse> ParentFunclet =
+ CS.getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
+
+ // OK, the inlining site is legal. What about the target function?
+
+ if (CallSiteEHPad) {
+ if (Personality == EHPersonality::MSVC_CXX) {
+ // The MSVC personality cannot tolerate catches getting inlined into
+ // cleanup funclets.
+ if (isa<CleanupPadInst>(CallSiteEHPad)) {
+ // Ok, the call site is within a cleanuppad. Let's check the callee
+ // for catchpads.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
+ return false;
+ }
+ }
+ } else if (isAsynchronousEHPersonality(Personality)) {
+          // SEH is even less tolerant; there may not be any sort of exceptional
+ // funclet in the callee.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (CalledBB.isEHPad())
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ // Determine if we are dealing with a call in an EHPad which does not unwind
+ // to caller.
+ bool EHPadForCallUnwindsLocally = false;
+ if (CallSiteEHPad && CS.isCall()) {
+ UnwindDestMemoTy FuncletUnwindMap;
+ Value *CallSiteUnwindDestToken =
+ getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+
+ EHPadForCallUnwindsLocally =
+ CallSiteUnwindDestToken &&
+ !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
+ }
+
+ // Get an iterator to the last basic block in the function, which will have
+ // the new function inlined after it.
+ Function::iterator LastBlock = --Caller->end();
+
+ // Make sure to capture all of the return instructions from the cloned
+ // function.
+ SmallVector<ReturnInst*, 8> Returns;
+ ClonedCodeInfo InlinedFunctionInfo;
+ Function::iterator FirstNewBlock;
+
+ { // Scope to destroy VMap after cloning.
+ ValueToValueMapTy VMap;
+ // Keep a list of pair (dst, src) to emit byval initializations.
+ SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
+
+ auto &DL = Caller->getParent()->getDataLayout();
+
+ assert(CalledFunc->arg_size() == CS.arg_size() &&
+ "No varargs calls can be inlined!");
+
+ // Calculate the vector of arguments to pass into the function cloner, which
+ // matches up the formal to the actual argument values.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = CalledFunc->arg_begin(),
+ E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+ Value *ActualArg = *AI;
+
+      // When byval arguments are actually inlined, we need to make the copy
+      // implied by them explicit. However, we don't do this if the callee is
+      // readonly or readnone, because the copy would be unneeded: the callee
+      // doesn't modify the struct.
+ if (CS.isByValArgument(ArgNo)) {
+ ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI,
+ CalledFunc->getParamAlignment(ArgNo));
+ if (ActualArg != *AI)
+ ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
+ }
+
+ VMap[&*I] = ActualArg;
+ }
+
+ // Add alignment assumptions if necessary. We do this before the inlined
+ // instructions are actually cloned into the caller so that we can easily
+ // check what will be known at the start of the inlined code.
+ AddAlignmentAssumptions(CS, IFI);
+
+ // We want the inliner to prune the code as it copies. We would LOVE to
+ // have no dead or constant instructions leftover after inlining occurs
+ // (which can happen, e.g., because an argument was constant), but we'll be
+ // happy with whatever the cloner can do.
+ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
+ /*ModuleLevelChanges=*/false, Returns, ".i",
+ &InlinedFunctionInfo, TheCall);
+ // Remember the first block that is newly cloned over.
+ FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+ if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
+ // Update the BFI of blocks cloned into the caller.
+ updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
+ CalledFunc->front());
+
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI, IFI.CallerBFI);
+ // Update the profile count of callee.
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
+
+ // Inject byval arguments initialization.
+ for (std::pair<Value*, Value*> &Init : ByValInit)
+ HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
+ &*FirstNewBlock, IFI);
+
+ Optional<OperandBundleUse> ParentDeopt =
+ CS.getOperandBundle(LLVMContext::OB_deopt);
+ if (ParentDeopt) {
+ SmallVector<OperandBundleDef, 2> OpDefs;
+
+ for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
+ Instruction *I = dyn_cast_or_null<Instruction>(VH);
+ if (!I) continue; // instruction was DCE'd or RAUW'ed to undef
+
+ OpDefs.clear();
+
+ CallSite ICS(I);
+ OpDefs.reserve(ICS.getNumOperandBundles());
+
+ for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) {
+ auto ChildOB = ICS.getOperandBundleAt(i);
+ if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
+ // If the inlined call has other operand bundles, let them be
+ OpDefs.emplace_back(ChildOB);
+ continue;
+ }
+
+ // It may be useful to separate this logic (of handling operand
+ // bundles) out to a separate "policy" component if this gets crowded.
+ // Prepend the parent's deoptimization continuation to the newly
+ // inlined call's deoptimization continuation.
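+          // Illustrative example (not part of this patch): a parent bundle
+          // "deopt"(i32 1, i32 2) and a child bundle "deopt"(i32 3) merge
+          // into "deopt"(i32 1, i32 2, i32 3).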
+ std::vector<Value *> MergedDeoptArgs;
+ MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() +
+ ChildOB.Inputs.size());
+
+ MergedDeoptArgs.insert(MergedDeoptArgs.end(),
+ ParentDeopt->Inputs.begin(),
+ ParentDeopt->Inputs.end());
+ MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(),
+ ChildOB.Inputs.end());
+
+ OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
+ }
+
+ Instruction *NewI = nullptr;
+ if (isa<CallInst>(I))
+ NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I);
+ else
+ NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I);
+
+ // Note: the RAUW does the appropriate fixup in VMap, so we need to do
+ // this even if the call returns void.
+ I->replaceAllUsesWith(NewI);
+
+ VH = nullptr;
+ I->eraseFromParent();
+ }
+ }
+
+ // Update the callgraph if requested.
+ if (IFI.CG)
+ UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
+
+ // For 'nodebug' functions, the associated DISubprogram is always null.
+ // Conservatively avoid propagating the callsite debug location to
+ // instructions inlined from a function whose DISubprogram is not null.
+ fixupLineNumbers(Caller, FirstNewBlock, TheCall,
+ CalledFunc->getSubprogram() != nullptr);
+
+ // Clone existing noalias metadata if necessary.
+ CloneAliasScopeMetadata(CS, VMap);
+
+ // Add noalias metadata if necessary.
+ AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR);
+
+ // Propagate llvm.mem.parallel_loop_access if necessary.
+ PropagateParallelLoopAccessMetadata(CS, VMap);
+
+ // Register any cloned assumptions.
+ if (IFI.GetAssumptionCache)
+ for (BasicBlock &NewBlock :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ (*IFI.GetAssumptionCache)(*Caller).registerAssumption(II);
+ }
+ }
+
+ // If there are any alloca instructions in the block that used to be the entry
+ // block for the callee, move them to the entry block of the caller. First
+ // calculate which instruction they should be inserted before. We insert the
+ // instructions at the end of the current alloca list.
+ {
+ BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+ for (BasicBlock::iterator I = FirstNewBlock->begin(),
+ E = FirstNewBlock->end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I++);
+ if (!AI) continue;
+
+ // If the alloca is now dead, remove it. This often occurs due to code
+ // specialization.
+ if (AI->use_empty()) {
+ AI->eraseFromParent();
+ continue;
+ }
+
+ if (!allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ // Keep track of the static allocas that we inline into the caller.
+ IFI.StaticAllocas.push_back(AI);
+
+ // Scan for the block of allocas that we can move over, and move them
+ // all at once.
+ while (isa<AllocaInst>(I) &&
+ allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) {
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
+ ++I;
+ }
+
+ // Transfer all of the allocas over in a block. Using splice means
+ // that the instructions aren't removed from the symbol table, then
+ // reinserted.
+ Caller->getEntryBlock().getInstList().splice(
+ InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
+ }
+ // Move any dbg.declares describing the allocas into the entry basic block.
+ DIBuilder DIB(*Caller->getParent());
+ for (auto &AI : IFI.StaticAllocas)
+ replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false);
+ }
+
+ bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
+ if (InlinedFunctionInfo.ContainsCalls) {
+ CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
+ if (CallInst *CI = dyn_cast<CallInst>(TheCall))
+ CallSiteTailKind = CI->getTailCallKind();
+
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
+ ++BB) {
+ for (Instruction &I : *BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+
+ if (Function *F = CI->getCalledFunction())
+ InlinedDeoptimizeCalls |=
+ F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+
+ // We need to reduce the strength of any inlined tail calls. For
+ // musttail, we have to avoid introducing potential unbounded stack
+ // growth. For example, if functions 'f' and 'g' are mutually recursive
+ // with musttail, we can inline 'g' into 'f' so long as we preserve
+ // musttail on the cloned call to 'f'. If either the inlined call site
+ // or the cloned call site is *not* musttail, the program already has
+ // one frame of stack growth, so it's safe to remove musttail. Here is
+ // a table of example transformations:
+ //
+ // f -> musttail g -> musttail f ==> f -> musttail f
+ // f -> musttail g -> tail f ==> f -> tail f
+ // f -> g -> musttail f ==> f -> f
+ // f -> g -> tail f ==> f -> f
+ CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
+ ChildTCK = std::min(CallSiteTailKind, ChildTCK);
+ CI->setTailCallKind(ChildTCK);
+ InlinedMustTailCalls |= CI->isMustTailCall();
+
+ // Calls inlined through a 'nounwind' call site should be marked
+ // 'nounwind'.
+ if (MarkNoUnwind)
+ CI->setDoesNotThrow();
+ }
+ }
+ }
+
+  // Leave lifetime markers for the static allocas, scoping them to the
+ // function we just inlined.
+ if (InsertLifetime && !IFI.StaticAllocas.empty()) {
+ IRBuilder<> builder(&FirstNewBlock->front());
+ for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+ AllocaInst *AI = IFI.StaticAllocas[ai];
+ // Don't mark swifterror allocas. They can't have bitcast uses.
+ if (AI->isSwiftError())
+ continue;
+
+ // If the alloca is already scoped to something smaller than the whole
+ // function then there's no need to add redundant, less accurate markers.
+ if (hasLifetimeMarkers(AI))
+ continue;
+
+ // Try to determine the size of the allocation.
+ ConstantInt *AllocaSize = nullptr;
+ if (ConstantInt *AIArraySize =
+ dyn_cast<ConstantInt>(AI->getArraySize())) {
+ auto &DL = Caller->getParent()->getDataLayout();
+ Type *AllocaType = AI->getAllocatedType();
+ uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+ uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+
+ // Don't add markers for zero-sized allocas.
+ if (AllocaArraySize == 0)
+ continue;
+
+ // Check that array size doesn't saturate uint64_t and doesn't
+ // overflow when it's multiplied by type size.
+ if (AllocaArraySize != ~0ULL &&
+ UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
+ AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+ AllocaArraySize * AllocaTypeSize);
+ }
+ }
+
+ builder.CreateLifetimeStart(AI, AllocaSize);
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
+ // call and a return. The return kills all local allocas.
+ if (InlinedMustTailCalls &&
+ RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls &&
+ RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
+ }
+ }
+ }
+
+ // If the inlined code contained dynamic alloca instructions, wrap the inlined
+ // code with llvm.stacksave/llvm.stackrestore intrinsics.
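+  // Illustrative shape of the result (not part of this patch):
+  //   %savedstack = call i8* @llvm.stacksave()
+  //   ... inlined code containing dynamic allocas ...
+  //   call void @llvm.stackrestore(i8* %savedstack)
+  // with one stackrestore emitted before each inlined return.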
+ if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+ Module *M = Caller->getParent();
+ // Get the two intrinsics we care about.
+ Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+    Function *StackRestore =
+        Intrinsic::getDeclaration(M, Intrinsic::stackrestore);
+
+ // Insert the llvm.stacksave.
+ CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())
+ .CreateCall(StackSave, {}, "savedstack");
+
+ // Insert a call to llvm.stackrestore before any return instructions in the
+ // inlined function.
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.stackrestore calls between a musttail or deoptimize
+ // call and a return. The return will restore the stack pointer.
+ if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
+ }
+ }
+
+ // If we are inlining for an invoke instruction, we must make sure to rewrite
+ // any call instructions into invoke instructions. This is sensitive to which
+ // funclet pads were top-level in the inlinee, so must be done before
+ // rewriting the "parent pad" links.
+ if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
+ if (isa<LandingPadInst>(FirstNonPHI)) {
+ HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ } else {
+ HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ }
+ }
+
+ // Update the lexical scopes of the new funclets and callsites.
+ // Anything that had 'none' as its parent is now nested inside the callsite's
+ // EHPad.
+
+ if (CallSiteEHPad) {
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB) {
+ // Add bundle operands to any top-level call sites.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
+ Instruction *I = &*BBI++;
+ CallSite CS(I);
+ if (!CS)
+ continue;
+
+ // Skip call sites which are nounwind intrinsics.
+ auto *CalledFn =
+ dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow())
+ continue;
+
+ // Skip call sites which already have a "funclet" bundle.
+ if (CS.getOperandBundle(LLVMContext::OB_funclet))
+ continue;
+
+ CS.getOperandBundlesAsDefs(OpBundles);
+ OpBundles.emplace_back("funclet", CallSiteEHPad);
+
+ Instruction *NewInst;
+ if (CS.isCall())
+ NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I);
+ else
+ NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I);
+ NewInst->takeName(I);
+ I->replaceAllUsesWith(NewInst);
+ I->eraseFromParent();
+
+ OpBundles.clear();
+ }
+
+ // It is problematic if the inlinee has a cleanupret which unwinds to
+ // caller and we inline it into a call site which doesn't unwind but into
+ // an EH pad that does. Such an edge must be dynamically unreachable.
+ // As such, we replace the cleanupret with unreachable.
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
+ if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
+ changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
+ CatchSwitch->setParentPad(CallSiteEHPad);
+ } else {
+ auto *FPI = cast<FuncletPadInst>(I);
+ if (isa<ConstantTokenNone>(FPI->getParentPad()))
+ FPI->setParentPad(CallSiteEHPad);
+ }
+ }
+ }
+
+ if (InlinedDeoptimizeCalls) {
+ // We need to at least remove the deoptimizing returns from the Return set,
+ // so that the control flow from those returns does not get merged into the
+ // caller (but terminate it instead). If the caller's return type does not
+ // match the callee's return type, we also need to change the return type of
+ // the intrinsic.
+ if (Caller->getReturnType() == TheCall->getType()) {
+ auto NewEnd = remove_if(Returns, [](ReturnInst *RI) {
+ return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
+ });
+ Returns.erase(NewEnd, Returns.end());
+ } else {
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
+ Caller->getParent(), Intrinsic::experimental_deoptimize,
+ {Caller->getReturnType()});
+
+ for (ReturnInst *RI : Returns) {
+ CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
+ if (!DeoptCall) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+
+ // The calling convention on the deoptimize call itself may be bogus,
+ // since the code we're inlining may have undefined behavior (and may
+ // never actually execute at runtime); but all
+ // @llvm.experimental.deoptimize declarations have to have the same
+ // calling convention in a well-formed module.
+ auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
+ NewDeoptIntrinsic->setCallingConv(CallingConv);
+ auto *CurBB = RI->getParent();
+ RI->eraseFromParent();
+
+ SmallVector<Value *, 4> CallArgs(DeoptCall->arg_begin(),
+ DeoptCall->arg_end());
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ DeoptCall->getOperandBundlesAsDefs(OpBundles);
+ DeoptCall->eraseFromParent();
+ assert(!OpBundles.empty() &&
+ "Expected at least the deopt operand bundle");
+
+ IRBuilder<> Builder(CurBB);
+ CallInst *NewDeoptCall =
+ Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
+ NewDeoptCall->setCallingConv(CallingConv);
+ if (NewDeoptCall->getType()->isVoidTy())
+ Builder.CreateRetVoid();
+ else
+ Builder.CreateRet(NewDeoptCall);
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+ }
+
+ // Handle any inlined musttail call sites. In order for a new call site to be
+ // musttail, the source of the clone and the inlined call site must have been
+ // musttail. Therefore it's safe to return without merging control into the
+ // phi below.
+ if (InlinedMustTailCalls) {
+ // Check if we need to bitcast the result of any musttail calls.
+ Type *NewRetTy = Caller->getReturnType();
+ bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy;
+
+ // Handle the returns preceded by musttail calls separately.
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ for (ReturnInst *RI : Returns) {
+ CallInst *ReturnedMustTail =
+ RI->getParent()->getTerminatingMustTailCall();
+ if (!ReturnedMustTail) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+ if (!NeedBitCast)
+ continue;
+
+ // Delete the old return and any preceding bitcast.
+ BasicBlock *CurBB = RI->getParent();
+ auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue());
+ RI->eraseFromParent();
+ if (OldCast)
+ OldCast->eraseFromParent();
+
+ // Insert a new bitcast and return with the right type.
+ IRBuilder<> Builder(CurBB);
+ Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy));
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+
+ // Now that all of the transforms on the inlined code have taken place but
+ // before we splice the inlined code into the CFG and lose track of which
+ // blocks were actually inlined, collect the call sites. We only do this if
+ // call graph updates weren't requested, as those provide value handle based
+ // tracking of inlined call sites instead.
+ if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
+ // Otherwise just collect the raw call sites that were inlined.
+ for (BasicBlock &NewBB :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBB)
+ if (auto CS = CallSite(&I))
+ IFI.InlinedCallSites.push_back(CS);
+ }
+
+ // If we cloned in _exactly one_ basic block, and if that block ends in a
+ // return instruction, we splice the body of the inlined callee directly into
+ // the calling basic block.
+ if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+ // Move all of the instructions right before the call.
+ OrigBB->getInstList().splice(TheCall->getIterator(),
+ FirstNewBlock->getInstList(),
+ FirstNewBlock->begin(), FirstNewBlock->end());
+ // Remove the cloned basic block.
+ Caller->getBasicBlockList().pop_back();
+
+ // If the call site was an invoke instruction, add a branch to the normal
+ // destination.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+ BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+ NewBr->setDebugLoc(Returns[0]->getDebugLoc());
+ }
+
+ // If the return instruction returned a value, replace uses of the call with
+ // uses of the returned value.
+ if (!TheCall->use_empty()) {
+ ReturnInst *R = Returns[0];
+ if (TheCall == R->getReturnValue())
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ else
+ TheCall->replaceAllUsesWith(R->getReturnValue());
+ }
+ // Since we are now done with the Call/Invoke, we can delete it.
+ TheCall->eraseFromParent();
+
+ // Since we are now done with the return instruction, delete it also.
+ Returns[0]->eraseFromParent();
+
+ // We are now done with the inlining.
+ return true;
+ }
+
+ // Otherwise, we have the normal case, of more than one block to inline or
+ // multiple return sites.
+
+ // We want to clone the entire callee function into the hole between the
+ // "starter" and "ender" blocks. How we accomplish this depends on whether
+ // this is an invoke instruction or a call instruction.
+ BasicBlock *AfterCallBB;
+ BranchInst *CreatedBranchToNormalDest = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+
+ // Add an unconditional branch to make this look like the CallInst case...
+ CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall);
+
+ // Split the basic block. This guarantees that no PHI nodes will have to be
+ // updated due to new incoming edges, and make the invoke case more
+ // symmetric to the call case.
+ AfterCallBB =
+ OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
+ CalledFunc->getName() + ".exit");
+
+ } else { // It's a call
+ // If this is a call instruction, we need to split the basic block that
+ // the call lives in.
+ //
+ AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(),
+ CalledFunc->getName() + ".exit");
+ }
+
+ if (IFI.CallerBFI) {
+ // Copy original BB's block frequency to AfterCallBB
+ IFI.CallerBFI->setBlockFreq(
+ AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+ }
+
+ // Change the branch that used to go to AfterCallBB to branch to the first
+ // basic block of the inlined function.
+ //
+ TerminatorInst *Br = OrigBB->getTerminator();
+ assert(Br && Br->getOpcode() == Instruction::Br &&
+ "splitBasicBlock broken!");
+ Br->setOperand(0, &*FirstNewBlock);
+
+ // Now that the function is correct, make it a little bit nicer. In
+ // particular, move the basic blocks inserted from the end of the function
+ // into the space made by splitting the source basic block.
+ Caller->getBasicBlockList().splice(AfterCallBB->getIterator(),
+ Caller->getBasicBlockList(), FirstNewBlock,
+ Caller->end());
+
+ // Handle all of the return instructions that we just cloned in, and eliminate
+ // any users of the original call/invoke instruction.
+ Type *RTy = CalledFunc->getReturnType();
+
+ PHINode *PHI = nullptr;
+ if (Returns.size() > 1) {
+ // The PHI node should go at the front of the new basic block to merge all
+ // possible incoming values.
+ if (!TheCall->use_empty()) {
+ PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(),
+ &AfterCallBB->front());
+ // Anything that used the result of the function call should now use the
+ // PHI node as their operand.
+ TheCall->replaceAllUsesWith(PHI);
+ }
+
+ // Loop over all of the return instructions adding entries to the PHI node
+ // as appropriate.
+ if (PHI) {
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ assert(RI->getReturnValue()->getType() == PHI->getType() &&
+ "Ret value not consistent in function!");
+ PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+ }
+ }
+
+ // Add a branch to the merge points and remove return instructions.
+ DebugLoc Loc;
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
+ Loc = RI->getDebugLoc();
+ BI->setDebugLoc(Loc);
+ RI->eraseFromParent();
+ }
+ // We need to set the debug location to *somewhere* inside the
+ // inlined function. The line number may be nonsensical, but the
+ // instruction will at least be associated with the right
+ // function.
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Loc);
+ } else if (!Returns.empty()) {
+ // Otherwise, if there is exactly one return value, just replace anything
+ // using the return value of the call with the computed value.
+ if (!TheCall->use_empty()) {
+ if (TheCall == Returns[0]->getReturnValue())
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ else
+ TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+ }
+
+ // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+ BasicBlock *ReturnBB = Returns[0]->getParent();
+ ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+ // Splice the code from the return block into the block that it will return
+ // to, which contains the code that was after the call.
+ AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+ ReturnBB->getInstList());
+
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
+
+ // Delete the return instruction now and empty ReturnBB now.
+ Returns[0]->eraseFromParent();
+ ReturnBB->eraseFromParent();
+ } else if (!TheCall->use_empty()) {
+ // No returns, but something is using the return value of the call. Just
+ // nuke the result.
+ TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ }
+
+ // Since we are now done with the Call/Invoke, we can delete it.
+ TheCall->eraseFromParent();
+
+ // If we inlined any musttail calls and the original return is now
+ // unreachable, delete it. It can only contain a bitcast and ret.
+ if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB))
+ AfterCallBB->eraseFromParent();
+
+ // We should always be able to fold the entry block of the function into the
+ // single predecessor of the block...
+ assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+ BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+ // Splice the code entry block into calling block, right before the
+ // unconditional branch.
+ CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
+ OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());
+
+ // Remove the unconditional branch.
+ OrigBB->getInstList().erase(Br);
+
+ // Now we can remove the CalleeEntry block, which is now empty.
+ Caller->getBasicBlockList().erase(CalleeEntry);
+
+ // If we inserted a phi node, check to see if it has a single value (e.g. all
+ // the entries are the same or undef). If so, remove the PHI so it doesn't
+ // block other optimizations.
+ if (PHI) {
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
+ auto &DL = Caller->getParent()->getDataLayout();
+ if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
+ PHI->replaceAllUsesWith(V);
+ PHI->eraseFromParent();
+ }
+ }
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
new file mode 100644
index 000000000000..23ec45edb3ef
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -0,0 +1,63 @@
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names; this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
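+//
+// Illustrative effect (not part of this patch): after the pass runs, an
+// unnamed value such as "%0 = add i32 %a, %b" is printed as
+// "%tmp = add i32 %a, %b" (with numeric suffixes for uniqueness), unnamed
+// blocks become "bb", and unnamed arguments become "arg".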
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+namespace {
+ struct InstNamer : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstNamer() : FunctionPass(ID) {
+ initializeInstNamerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &Info) const override {
+ Info.setPreservesAll();
+ }
+
+ bool runOnFunction(Function &F) override {
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
+
+ for (BasicBlock &BB : F) {
+ if (!BB.hasName())
+ BB.setName("bb");
+
+ for (Instruction &I : BB)
+ if (!I.hasName() && !I.getType()->isVoidTy())
+ I.setName("tmp");
+ }
+ return true;
+ }
+ };
+
+ char InstNamer::ID = 0;
+}
+
+INITIALIZE_PASS(InstNamer, "instnamer",
+ "Assign names to anonymous instructions", false, false)
+char &llvm::InstructionNamerID = InstNamer::ID;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+ return new InstNamer();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 000000000000..5a90dcb033b2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,674 @@
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32-bit and 64-bit scalar integer
+// division for targets that don't have native support. It's largely derived
+// from compiler-rt's implementations of __udivsi3 and __udivmoddi4,
+// but hand-tuned for targets that prefer less control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "integer-division"
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the urem (if present, i.e. not folded), ready to
+/// be expanded if the user wishes.
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %dividend_sgn = ashr i32 %dividend, 31
+ // ; %divisor_sgn = ashr i32 %divisor, 31
+ // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
+ // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
+ // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
+ // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
+ // ; %urem = urem i32 %dividend, %divisor
+ // ; %xored = xor i32 %urem, %dividend_sgn
+ // ; %srem = sub i32 %xored, %dividend_sgn
+ Value *DividendSign = Builder.CreateAShr(Dividend, Shift);
+ Value *DivisorSign = Builder.CreateAShr(Divisor, Shift);
+ Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
+ Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
+ Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
+ Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
+ Value *URem = Builder.CreateURem(UDividend, UDivisor);
+ Value *Xored = Builder.CreateXor(URem, DividendSign);
+ Value *SRem = Builder.CreateSub(Xored, DividendSign);
+
+ if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+ Builder.SetInsertPoint(URemInst);
+
+ return SRem;
+}
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Remainder = Dividend - Quotient*Divisor
+
+ // Following instructions are generated for both i32 and i64
+
+ // ; %quotient = udiv i32 %dividend, %divisor
+ // ; %product = mul i32 %divisor, %quotient
+ // ; %remainder = sub i32 %dividend, %product
+ Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ Value *Product = Builder.CreateMul(Divisor, Quotient);
+ Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+ Builder.SetInsertPoint(UDiv);
+
+ return Remainder;
+}
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Implementation taken from compiler-rt's __divsi3 and __divdi3
+
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %tmp = ashr i32 %dividend, 31
+ // ; %tmp1 = ashr i32 %divisor, 31
+ // ; %tmp2 = xor i32 %tmp, %dividend
+ // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
+ // ; %tmp3 = xor i32 %tmp1, %divisor
+ // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
+ // ; %q_sgn = xor i32 %tmp1, %tmp
+ // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
+ // ; %tmp4 = xor i32 %q_mag, %q_sgn
+ // ; %q = sub i32 %tmp4, %q_sgn
+ Value *Tmp = Builder.CreateAShr(Dividend, Shift);
+ Value *Tmp1 = Builder.CreateAShr(Divisor, Shift);
+ Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
+ Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+ Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
+ Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+ Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
+ Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+ Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
+ Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+ Builder.SetInsertPoint(UDiv);
+
+ return Q;
+}
+
+/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers.
+/// Returns the quotient, rounded towards 0. Builder's insert point should
+/// point where the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // The basic algorithm can be found in the compiler-rt project's
+ // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+ // that's been hand-tuned to lessen the amount of control flow involved.
+
+ // Some helper values
+ IntegerType *DivTy = cast<IntegerType>(Dividend->getType());
+ unsigned BitWidth = DivTy->getBitWidth();
+
+ ConstantInt *Zero;
+ ConstantInt *One;
+ ConstantInt *NegOne;
+ ConstantInt *MSB;
+
+ if (BitWidth == 64) {
+ Zero = Builder.getInt64(0);
+ One = Builder.getInt64(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Zero = Builder.getInt32(0);
+ One = Builder.getInt32(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt32(31);
+ }
+
+ ConstantInt *True = Builder.getTrue();
+
+ BasicBlock *IBB = Builder.GetInsertBlock();
+ Function *F = IBB->getParent();
+ Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ DivTy);
+
+ // Our CFG is going to look like:
+ // +---------------------+
+ // | special-cases |
+ // | ... |
+ // +---------------------+
+ // | |
+ // | +----------+
+ // | | bb1 |
+ // | | ... |
+ // | +----------+
+ // | | |
+ // | | +------------+
+ // | | | preheader |
+ // | | | ... |
+ // | | +------------+
+ // | | |
+ // | | | +---+
+ // | | | | |
+ // | | +------------+ |
+ // | | | do-while | |
+ // | | | ... | |
+ // | | +------------+ |
+ // | | | | |
+ // | +-----------+ +---+
+ // | | loop-exit |
+ // | | ... |
+ // | +-----------+
+ // | |
+ // +-------+
+ // | ... |
+ // | end |
+ // +-------+
+ BasicBlock *SpecialCases = Builder.GetInsertBlock();
+ SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+ BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+ "udiv-end");
+ BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
+ "udiv-loop-exit", F, End);
+ BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
+ "udiv-do-while", F, End);
+ BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+ "udiv-preheader", F, End);
+ BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
+ "udiv-bb1", F, End);
+
+ // We'll be overwriting the terminator to insert our extra blocks
+ SpecialCases->getTerminator()->eraseFromParent();
+
+ // Same instructions are generated for both i32 (msb 31) and i64 (msb 63).
+
+ // First off, check for special cases: dividend or divisor is zero, divisor
+ // is greater than dividend, and divisor is 1.
+ // ; special-cases:
+ // ; %ret0_1 = icmp eq i32 %divisor, 0
+ // ; %ret0_2 = icmp eq i32 %dividend, 0
+ // ; %ret0_3 = or i1 %ret0_1, %ret0_2
+ // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+ // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+ // ; %sr = sub nsw i32 %tmp0, %tmp1
+ // ; %ret0_4 = icmp ugt i32 %sr, 31
+ // ; %ret0 = or i1 %ret0_3, %ret0_4
+ // ; %retDividend = icmp eq i32 %sr, 31
+ // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
+ // ; %earlyRet = or i1 %ret0, %retDividend
+ // ; br i1 %earlyRet, label %end, label %bb1
+ Builder.SetInsertPoint(SpecialCases);
+ Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
+ Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
+ Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
+ Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True});
+ Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
+ Value *SR = Builder.CreateSub(Tmp0, Tmp1);
+ Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB);
+ Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
+ Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
+ Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
+ Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
+ Builder.CreateCondBr(EarlyRet, End, BB1);
+
+ // ; bb1: ; preds = %special-cases
+ // ; %sr_1 = add i32 %sr, 1
+ // ; %tmp2 = sub i32 31, %sr
+ // ; %q = shl i32 %dividend, %tmp2
+ // ; %skipLoop = icmp eq i32 %sr_1, 0
+ // ; br i1 %skipLoop, label %loop-exit, label %preheader
+ Builder.SetInsertPoint(BB1);
+ Value *SR_1 = Builder.CreateAdd(SR, One);
+ Value *Tmp2 = Builder.CreateSub(MSB, SR);
+ Value *Q = Builder.CreateShl(Dividend, Tmp2);
+ Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+ Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+ // ; preheader: ; preds = %bb1
+ // ; %tmp3 = lshr i32 %dividend, %sr_1
+ // ; %tmp4 = add i32 %divisor, -1
+ // ; br label %do-while
+ Builder.SetInsertPoint(Preheader);
+ Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+ Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+ Builder.CreateBr(DoWhile);
+
+ // ; do-while: ; preds = %do-while, %preheader
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ // ; %tmp5 = shl i32 %r_1, 1
+ // ; %tmp6 = lshr i32 %q_2, 31
+ // ; %tmp7 = or i32 %tmp5, %tmp6
+ // ; %tmp8 = shl i32 %q_2, 1
+ // ; %q_1 = or i32 %carry_1, %tmp8
+ // ; %tmp9 = sub i32 %tmp4, %tmp7
+ // ; %tmp10 = ashr i32 %tmp9, 31
+ // ; %carry = and i32 %tmp10, 1
+ // ; %tmp11 = and i32 %tmp10, %divisor
+ // ; %r = sub i32 %tmp7, %tmp11
+ // ; %sr_2 = add i32 %sr_3, -1
+ // ; %tmp12 = icmp eq i32 %sr_2, 0
+ // ; br i1 %tmp12, label %loop-exit, label %do-while
+ Builder.SetInsertPoint(DoWhile);
+ PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *SR_3 = Builder.CreatePHI(DivTy, 2);
+ PHINode *R_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_2 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp5 = Builder.CreateShl(R_1, One);
+ Value *Tmp6 = Builder.CreateLShr(Q_2, MSB);
+ Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
+ Value *Tmp8 = Builder.CreateShl(Q_2, One);
+ Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
+ Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
+ Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB);
+ Value *Carry = Builder.CreateAnd(Tmp10, One);
+ Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+ Value *R = Builder.CreateSub(Tmp7, Tmp11);
+ Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
+ Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+ Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+ // ; loop-exit: ; preds = %do-while, %bb1
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ // ; %tmp13 = shl i32 %q_3, 1
+ // ; %q_4 = or i32 %carry_2, %tmp13
+ // ; br label %end
+ Builder.SetInsertPoint(LoopExit);
+ PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_3 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp13 = Builder.CreateShl(Q_3, One);
+ Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
+ Builder.CreateBr(End);
+
+ // ; end: ; preds = %loop-exit, %special-cases
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ // ; ret i32 %q_5
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Q_5 = Builder.CreatePHI(DivTy, 2);
+
+ // Populate the Phis, since all values have now been created. Our Phis were:
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ Carry_1->addIncoming(Zero, Preheader);
+ Carry_1->addIncoming(Carry, DoWhile);
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ SR_3->addIncoming(SR_1, Preheader);
+ SR_3->addIncoming(SR_2, DoWhile);
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ R_1->addIncoming(Tmp3, Preheader);
+ R_1->addIncoming(R, DoWhile);
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ Q_2->addIncoming(Q, Preheader);
+ Q_2->addIncoming(Q_1, DoWhile);
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ Carry_2->addIncoming(Zero, BB1);
+ Carry_2->addIncoming(Carry, DoWhile);
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ Q_3->addIncoming(Q, BB1);
+ Q_3->addIncoming(Q_1, DoWhile);
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ Q_5->addIncoming(Q_4, LoopExit);
+ Q_5->addIncoming(RetVal, SpecialCases);
+
+ return Q_5;
+}
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands is known. Implements both 32-bit and 64-bit
+/// scalar division.
+///
+/// @brief Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ IRBuilder<> Builder(Rem);
+
+ assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Rem->getType()->getIntegerBitWidth() == 32 ||
+ Rem->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed remainder
+ if (Rem->getOpcode() == Instruction::SRem) {
+ Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Rem is still valid.
+ bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // If we didn't actually generate a urem instruction, we're done. This
+ // happens, for example, if the inputs were constant; in that case the
+ // Builder insertion point was left unchanged.
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Rem = BO;
+ }
+
+ Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1),
+ Builder);
+
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // Expand the udiv
+ if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+ assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+ expandDivision(UDiv);
+ }
+
+ return true;
+}
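+
+// Editorial usage sketch (not part of the original file): a hypothetical
+// helper showing how a caller might drive expandRemainder. Candidates are
+// collected first because the expansion erases and replaces instructions.
+// The name expandAllRemainders is illustrative only.
+//
+//   static bool expandAllRemainders(Function &F) {
+//     SmallVector<BinaryOperator *, 8> Rems;
+//     for (BasicBlock &BB : F)
+//       for (Instruction &I : BB)
+//         if (auto *BO = dyn_cast<BinaryOperator>(&I))
+//           if ((BO->getOpcode() == Instruction::SRem ||
+//                BO->getOpcode() == Instruction::URem) &&
+//               !BO->getType()->isVectorTy() &&
+//               (BO->getType()->getIntegerBitWidth() == 32 ||
+//                BO->getType()->getIntegerBitWidth() == 64))
+//             Rems.push_back(BO);
+//     bool Changed = false;
+//     for (BinaryOperator *Rem : Rems)
+//       Changed |= expandRemainder(Rem);
+//     return Changed;
+//   }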
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similar to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands is known. Implements both
+/// 32-bit and 64-bit scalar division.
+///
+/// @brief Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ IRBuilder<> Builder(Div);
+
+ assert(!Div->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Div->getType()->getIntegerBitWidth() == 32 ||
+ Div->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed division
+ if (Div->getOpcode() == Instruction::SDiv) {
+ // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Div is still valid.
+ bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ // If we didn't actually generate a udiv instruction, we're done. This
+ // happens, for example, if the inputs were constant; in that case the
+ // Builder insertion point was left unchanged.
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Div = BO;
+ }
+
+ // Insert the unsigned division code
+ Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1),
+ Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return true;
+}
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 32 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 32 bits; that is, these routines are good for targets
+/// that have no or very little support for integer arithmetic narrower than
+/// 32 bits.
+///
+/// @brief Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 32 &&
+ "Div of bitwidth greater than 32 not supported");
+
+ if (RemTyBitWidth == 32)
+ return expandRemainder(Rem);
+
+ // If the bitwidth is smaller than 32, extend the inputs, compute the
+ // remainder in 32 bits, and truncate the result.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 64 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 64 bits.
+///
+/// @brief Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported");
+
+ if (RemTyBitWidth == 64)
+ return expandRemainder(Rem);
+
+ // If the bitwidth is smaller than 64, extend the inputs, compute the
+ // remainder in 64 bits, and truncate the result.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 32 bits; that is, these routines are good for targets that have no
+/// or very little support for smaller than 32 bit integer arithmetic.
+///
+/// @brief Replace Div with emulation code.
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");
+
+ if (DivTyBitWidth == 32)
+ return expandDivision(Div);
+
+ // If the bitwidth is smaller than 32, extend the inputs, perform the
+ // division in 32 bits, and truncate the result.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
+
+/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 64 bits.
+///
+/// @brief Replace Div with emulation code.
+bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 64 &&
+ "Div of bitwidth greater than 64 not supported");
+
+ if (DivTyBitWidth == 64)
+ return expandDivision(Div);
+
+ // If the bitwidth is smaller than 64, extend the inputs, perform the
+ // division in 64 bits, and truncate the result.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
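+
+// Editorial usage sketch (not part of the original file): a hypothetical
+// dispatcher showing how a lowering pass might pick between the
+// "up to 32 bits" and "up to 64 bits" entry points above based on the width
+// of a scalar integer division. The name expandDivisionForTarget is
+// illustrative only.
+//
+//   static bool expandDivisionForTarget(BinaryOperator *Div) {
+//     unsigned BitWidth = Div->getType()->getIntegerBitWidth();
+//     if (BitWidth <= 32)
+//       return expandDivisionUpTo32Bits(Div);
+//     if (BitWidth <= 64)
+//       return expandDivisionUpTo64Bits(Div);
+//     return false; // Wider types would need a different strategy.
+//   }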
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
new file mode 100644
index 000000000000..089f2b5f3b18
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -0,0 +1,438 @@
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary. For example, it turns
+// the left into the right code:
+//
+// for (...) for (...)
+// if (c) if (c)
+// X1 = ... X1 = ...
+// else else
+// X2 = ... X2 = ...
+// X3 = phi(X1, X2) X3 = phi(X1, X2)
+// ... = X3 + 4 X4 = phi(X3)
+// ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine. The major benefit of this
+// transformation is that it makes many other loop optimizations, such as
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LCSSA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lcssa"
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyLoopLCSSA = true;
+#else
+static bool VerifyLoopLCSSA = false;
+#endif
+static cl::opt<bool,true>
+VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
+ cl::desc("Verify loop lcssa form (time consuming)"));
+
+/// Return true if the specified block is in the list.
+static bool isExitBlock(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ return is_contained(ExitBlocks, BB);
+}
+
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+ DominatorTree &DT, LoopInfo &LI) {
+ SmallVector<Use *, 16> UsesToRewrite;
+ SmallSetVector<PHINode *, 16> PHIsToRemove;
+ PredIteratorCache PredCache;
+ bool Changed = false;
+
+ // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
+ // instructions within the same loops, computing the exit blocks is
+ // expensive, and we're not mutating the loop structure.
+ SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
+
+ while (!Worklist.empty()) {
+ UsesToRewrite.clear();
+
+ Instruction *I = Worklist.pop_back_val();
+ assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
+ BasicBlock *InstBB = I->getParent();
+ Loop *L = LI.getLoopFor(InstBB);
+ assert(L && "Instruction belongs to a BB that's not part of a loop");
+ if (!LoopExitBlocks.count(L))
+ L->getExitBlocks(LoopExitBlocks[L]);
+ assert(LoopExitBlocks.count(L));
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
+
+ if (ExitBlocks.empty())
+ continue;
+
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = User->getParent();
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(U);
+
+ if (InstBB != UserBB && !L->contains(UserBB))
+ UsesToRewrite.push_back(&U);
+ }
+
+ // If there are no uses outside the loop, exit with no change.
+ if (UsesToRewrite.empty())
+ continue;
+
+ ++NumLCSSA; // We are applying the transformation
+
+ // Invoke instructions are special in that their result value is not
+ // available along their unwind edge. The code below tests to see whether
+ // DomBB dominates the value, so adjust DomBB to the normal destination
+ // block, which is effectively where the value is first usable.
+ BasicBlock *DomBB = InstBB;
+ if (auto *Inv = dyn_cast<InvokeInst>(I))
+ DomBB = Inv->getNormalDest();
+
+ DomTreeNode *DomNode = DT.getNode(DomBB);
+
+ SmallVector<PHINode *, 16> AddedPHIs;
+ SmallVector<PHINode *, 8> PostProcessPHIs;
+
+ SmallVector<PHINode *, 4> InsertedPHIs;
+ SSAUpdater SSAUpdate(&InsertedPHIs);
+ SSAUpdate.Initialize(I->getType(), I->getName());
+
+ // Insert LCSSA PHI nodes into all of the exit blocks dominated by the
+ // value, and record each one as the available value for its block.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+ continue;
+
+ // If we already inserted something for this BB, don't reprocess it.
+ if (SSAUpdate.HasValueForBlock(ExitBB))
+ continue;
+
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(ExitBB),
+ I->getName() + ".lcssa", &ExitBB->front());
+
+ // Add inputs from inside the loop for this PHI.
+ for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+ PN->addIncoming(I, Pred);
+
+ // If the exit block has a predecessor not within the loop, arrange for
+ // the incoming value use corresponding to that predecessor to be
+ // rewritten in terms of a different LCSSA PHI.
+ if (!L->contains(Pred))
+ UsesToRewrite.push_back(
+ &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+ PN->getNumIncomingValues() - 1)));
+ }
+
+ AddedPHIs.push_back(PN);
+
+ // Remember that this phi makes the value alive in this block.
+ SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+ // LoopSimplify might fail to simplify some loops (e.g. when indirect
+ // branches are involved). In such situations, it might happen that an
+ // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
+ // create PHIs in such an exit block, we are also inserting PHIs into L2's
+ // header. This could break LCSSA form for L2 because these inserted PHIs
+ // can also have uses outside of L2. Remember all PHIs in such situations
+ // so we can revisit them later on. FIXME: Remove this once indirectbr
+ // support in LoopSimplify is improved.
+ if (auto *OtherLoop = LI.getLoopFor(ExitBB))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(PN);
+ }
+
+ // Rewrite all uses outside the loop in terms of the new PHIs we just
+ // inserted.
+ for (Use *UseToRewrite : UsesToRewrite) {
+ // If this use is in an exit block, rewrite to use the newly inserted PHI.
+ // This is required for correctness because SSAUpdate doesn't handle uses
+ // in the same block. It assumes the PHI we inserted is at the end of the
+ // block.
+ Instruction *User = cast<Instruction>(UseToRewrite->getUser());
+ BasicBlock *UserBB = User->getParent();
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(*UseToRewrite);
+
+ if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+ // Tell the VHs that the uses changed. This updates SCEV's caches.
+ if (UseToRewrite->get()->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front());
+ UseToRewrite->set(&UserBB->front());
+ continue;
+ }
+
+ // Otherwise, do full PHI insertion.
+ SSAUpdate.RewriteUse(*UseToRewrite);
+ }
+
+ // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
+ // to post-process them to keep LCSSA form.
+ for (PHINode *InsertedPN : InsertedPHIs) {
+ if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(InsertedPN);
+ }
+
+ // Post process PHI instructions that were inserted into another disjoint
+ // loop and update their exits properly.
+ for (auto *PostProcessPN : PostProcessPHIs)
+ if (!PostProcessPN->use_empty())
+ Worklist.push_back(PostProcessPN);
+
+ // Keep track of PHI nodes that we want to remove because they did not have
+ // any uses rewritten.
+ for (PHINode *PN : AddedPHIs)
+ if (PN->use_empty())
+ PHIsToRemove.insert(PN);
+
+ Changed = true;
+ }
+ // Remove PHI nodes that did not have any uses rewritten.
+ for (PHINode *PN : PHIsToRemove) {
+ assert (PN->use_empty() && "Trying to remove a phi with uses.");
+ PN->eraseFromParent();
+ }
+ return Changed;
+}
+
+// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
+static void computeBlocksDominatingExits(
+ Loop &L, DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
+ SmallVector<BasicBlock *, 8> BBWorklist;
+
+ // We start from the exit blocks, as every block trivially dominates itself
+ // (not strictly).
+ for (BasicBlock *BB : ExitBlocks)
+ BBWorklist.push_back(BB);
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *BB = BBWorklist.pop_back_val();
+
+ // Check if this is a loop header. If this is the case, we're done.
+ if (L.getHeader() == BB)
+ continue;
+
+ // Otherwise, add its immediate dominator to the worklist, unless we have
+ // already visited it.
+ BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
+
+ // Exit blocks can have an immediate dominator that does not belong to the
+ // loop. If an exit block is immediately dominated by a block outside the
+ // loop, then not all paths from that dominator to the exit block go
+ // through the loop.
+ // Example:
+ //
+ // |---- A
+ // | |
+ // | B<--
+ // | | |
+ // |---> C --
+ // |
+ // D
+ //
+ // C is the exit block of the loop and it's immediately dominated by A,
+ // which doesn't belong to the loop.
+ if (!L.contains(IDomBB))
+ continue;
+
+ if (BlocksDominatingExits.insert(IDomBB))
+ BBWorklist.push_back(IDomBB);
+ }
+}
+
+bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ if (ExitBlocks.empty())
+ return false;
+
+ SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
+
+ // We want to limit use-scanning by leveraging dominance information.
+ // If a block doesn't dominate any of the loop exits, then none of the values
+ // defined in that block can be used outside the loop.
+ // We compute the set of blocks fulfilling this condition in advance by
+ // walking the dominator tree upwards until we hit a loop header.
+ computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
+
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Look at all the instructions in the loop, checking to see if they have uses
+ // outside the loop. If so, put them into the worklist to rewrite those uses.
+ for (BasicBlock *BB : BlocksDominatingExits) {
+ for (Instruction &I : *BB) {
+ // Reject two common cases fast: instructions with no uses (like stores)
+ // and instructions with one use that is in the same block as this.
+ if (I.use_empty() ||
+ (I.hasOneUse() && I.user_back()->getParent() == BB &&
+ !isa<PHINode>(I.user_back())))
+ continue;
+
+ // Tokens cannot be used in PHI nodes, so we skip over them.
+ // We can run into tokens which are live out of a loop with catchswitch
+ // instructions in Windows EH if the catchswitch has one catchpad which
+ // is inside the loop and another which is not.
+ if (I.getType()->isTokenTy())
+ continue;
+
+ Worklist.push_back(&I);
+ }
+ }
+ Changed = formLCSSAForInstructions(Worklist, DT, *LI);
+
+ // If we modified the code, remove any caches about the loop from SCEV to
+ // avoid dangling entries.
+ // FIXME: This is a big hammer, can we clear the cache more selectively?
+ if (SE && Changed)
+ SE->forgetLoop(&L);
+
+ assert(L.isLCSSAForm(DT));
+
+ return Changed;
+}
+
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+
+ // Recurse depth-first through inner loops.
+ for (Loop *SubLoop : L.getSubLoops())
+ Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+
+ Changed |= formLCSSA(L, DT, LI, SE);
+ return Changed;
+}
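+
+// Editorial usage sketch (not part of the original file): a loop transform
+// that may have introduced uses outside the loop typically re-establishes
+// LCSSA before returning. DT, LI and SE stand for analyses obtained from the
+// pass manager; SE may be null.
+//
+//   // ... transform loop L ...
+//   if (!L.isLCSSAForm(DT))
+//     formLCSSARecursively(L, DT, &LI, SE);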
+
+/// Process all loops in the function, inner-most out.
+static bool formLCSSAOnAllLoops(LoopInfo *LI, DominatorTree &DT,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+ for (auto &L : *LI)
+ Changed |= formLCSSARecursively(*L, DT, LI, SE);
+ return Changed;
+}
+
+namespace {
+struct LCSSAWrapperPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LCSSAWrapperPass() : FunctionPass(ID) {
+ initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Cached analysis information for the current function.
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+
+ bool runOnFunction(Function &F) override;
+ void verifyAnalysis() const override {
+ // This check is very expensive. On loop-intensive compiles it may cause
+ // up to a 10x slowdown. It is currently disabled by default. LPPassManager
+ // always performs a limited form of LCSSA verification. Similar reasoning
+ // was used for the LoopInfo verifier.
+ if (VerifyLoopLCSSA) {
+ assert(all_of(*LI,
+ [&](Loop *L) {
+ return L->isRecursivelyLCSSAForm(*DT, *LI);
+ }) &&
+ "LCSSA form is broken!");
+ }
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It maintains both of these,
+ /// as well as the CFG. It also requires dominator information.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+
+ // This is needed to perform LCSSA verification inside LPPassManager
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
+ }
+};
+}
+
+char LCSSAWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
+INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+
+Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
+char &llvm::LCSSAID = LCSSAWrapperPass::ID;
+
+/// Transform \p F into loop-closed SSA form.
+bool LCSSAWrapperPass::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ return formLCSSAOnAllLoops(LI, *DT, SE);
+}
+
+PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ if (!formLCSSAOnAllLoops(&LI, DT, SE))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
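+
+// Editorial usage sketch (not part of the original file): running the new-PM
+// pass directly over a function F, assuming the standard function analyses
+// have been registered through a PassBuilder.
+//
+//   FunctionAnalysisManager FAM;
+//   PassBuilder PB;
+//   PB.registerFunctionAnalyses(FAM);
+//   FunctionPassManager FPM;
+//   FPM.addPass(LCSSAPass());
+//   FPM.run(F, FAM);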
diff --git a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
new file mode 100644
index 000000000000..42aca757c2af
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -0,0 +1,565 @@
+//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrink-wraps a call to a function if the result is not used.
+// The call can set errno but is otherwise side effect free. For example:
+// sqrt(val);
+// is transformed to
+// if (val < 0)
+// sqrt(val);
+// Even if the result of a library call is not being used, the compiler cannot
+// safely delete the call because the function can set errno on error
+// conditions.
+// Note that in many functions the error condition depends solely on the
+// incoming parameter. In this optimization we generate the condition under
+// which errno could be set and use it to shrink-wrap the call. Since the
+// chances of hitting the error condition are low, the runtime call is
+// effectively eliminated.
+//
+// These partially dead calls are usually results of C++ abstraction penalty
+// exposed by inlining.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "libcalls-shrinkwrap"
+
+STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
+STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
+
+namespace {
+class LibCallsShrinkWrapLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
+ initializeLibCallsShrinkWrapLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char LibCallsShrinkWrapLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false, false)
+
+namespace {
+class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
+public:
+ LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
+ : TLI(TLI), DT(DT){};
+ void visitCallInst(CallInst &CI) { checkCandidate(CI); }
+ bool perform() {
+ bool Changed = false;
+ for (auto &CI : WorkList) {
+ DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
+ if (perform(CI)) {
+ Changed = true;
+ DEBUG(dbgs() << "Transformed\n");
+ }
+ }
+ return Changed;
+ }
+
+private:
+ bool perform(CallInst *CI);
+ void checkCandidate(CallInst &CI);
+ void shrinkWrapCI(CallInst *CI, Value *Cond);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
+
+ // Create an OR of two conditions.
+ Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
+ CmpInst::Predicate Cmp2, float Val2) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
+ auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
+ return BBBuilder.CreateOr(Cond1, Cond2);
+ }
+
+ // Create a single condition using IRBuilder.
+ Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
+ float Val) {
+ Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
+ if (!Arg->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Arg->getType());
+ return BBBuilder.CreateFCmp(Cmp, Arg, V);
+ }
+
+ // Create a single condition.
+ Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ return createCond(BBBuilder, Arg, Cmp, Val);
+ }
+
+ const TargetLibraryInfo &TLI;
+ DominatorTree *DT;
+ SmallVector<CallInst *, 16> WorkList;
+};
+} // end anonymous namespace
+
+// Perform the transformation to calls with errno set by domain error.
+bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc_acosf: // Same as acos
+ case LibFunc_acosl: // Same as acos
+ case LibFunc_asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc_asinf: // Same as asin
+ case LibFunc_asinl: // Same as asin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
+ break;
+ }
+ case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_cosf: // Same as cos
+ case LibFunc_cosl: // Same as cos
+ case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_sinf: // Same as sin
+ case LibFunc_sinl: // Same as sin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
+ -INFINITY);
+ break;
+ }
+ case LibFunc_acosh: // DomainError: (x < 1)
+ case LibFunc_acoshf: // Same as acosh
+ case LibFunc_acoshl: // Same as acosh
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
+ break;
+ }
+ case LibFunc_sqrt: // DomainError: (x < 0)
+ case LibFunc_sqrtf: // Same as sqrt
+ case LibFunc_sqrtl: // Same as sqrt
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by range error.
+bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl: {
+ Cond = generateTwoRangeCond(CI, Func);
+ break;
+ }
+ case LibFunc_expm1: // RangeError: (709, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ {
+ Cond = generateOneRangeCond(CI, Func);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by combination of errors.
+bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
+ // PoleError: (x == -1 || x == 1)
+ // Overall Cond: (x <= -1 || x >= 1)
+ case LibFunc_atanhf: // Same as atanh
+ case LibFunc_atanhl: // Same as atanh
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
+ break;
+ }
+ case LibFunc_log: // DomainError: (x < 0)
+ // PoleError: (x == 0)
+ // Overall Cond: (x <= 0)
+ case LibFunc_logf: // Same as log
+ case LibFunc_logl: // Same as log
+ case LibFunc_log10: // Same as log
+ case LibFunc_log10f: // Same as log
+ case LibFunc_log10l: // Same as log
+ case LibFunc_log2: // Same as log
+ case LibFunc_log2f: // Same as log
+ case LibFunc_log2l: // Same as log
+ case LibFunc_logb: // Same as log
+ case LibFunc_logbf: // Same as log
+ case LibFunc_logbl: // Same as log
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
+ break;
+ }
+ case LibFunc_log1p: // DomainError: (x < -1)
+ // PoleError: (x == -1)
+ // Overall Cond: (x <= -1)
+ case LibFunc_log1pf: // Same as log1p
+ case LibFunc_log1pl: // Same as log1p
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
+ break;
+ }
+ case LibFunc_pow: // DomainError: x < 0 and y is noninteger
+ // PoleError: x == 0 and y < 0
+ // RangeError: overflow or underflow
+ case LibFunc_powf:
+ case LibFunc_powl: {
+ Cond = generateCondForPow(CI, Func);
+ if (Cond == nullptr)
+ return false;
+ break;
+ }
+ default:
+ return false;
+ }
+ assert(Cond && "performCallErrors should not see an empty condition");
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Checks whether CI is a candidate for shrink-wrapping and, if so, puts it
+// into the work list.
+void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
+ if (CI.isNoBuiltin())
+ return;
+ // A possible improvement is to handle calls whose return value is used. If
+ // there is an API for a fast libcall implementation that does not set errno,
+ // we can use the same framework to direct/wrap the call to the fast API on
+ // the error-free path and leave the original call on the slow path.
+ if (!CI.use_empty())
+ return;
+
+ LibFunc Func;
+ Function *Callee = CI.getCalledFunction();
+ if (!Callee)
+ return;
+ if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
+ return;
+
+ if (CI.getNumArgOperands() == 0)
+ return;
+ // TODO: Handle long double in other formats.
+ Type *ArgType = CI.getArgOperand(0)->getType();
+ if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
+ ArgType->isX86_FP80Ty()))
+ return;
+
+ WorkList.push_back(&CI);
+}
+
+// Generate the upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound;
+ switch (Func) {
+ case LibFunc_expm1: // RangeError: (709, inf)
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ UpperBound = 11356.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedOneCond;
+ return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
+}
+
+// Generate the lower and upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound, LowerBound;
+ switch (Func) {
+ case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc_sinh: // Same as cosh
+ LowerBound = -710.0f;
+ UpperBound = 710.0f;
+ break;
+ case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc_sinhf: // Same as coshf
+ LowerBound = -89.0f;
+ UpperBound = 89.0f;
+ break;
+ case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc_sinhl: // Same as coshl
+ LowerBound = -11357.0f;
+ UpperBound = 11357.0f;
+ break;
+ case LibFunc_exp: // RangeError: (x < -745 || x > 709)
+ LowerBound = -745.0f;
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expf: // RangeError: (x < -103 || x > 88)
+ LowerBound = -103.0f;
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
+ LowerBound = -11399.0f;
+ UpperBound = 11356.0f;
+ break;
+ case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
+ LowerBound = -323.0f;
+ UpperBound = 308.0f;
+ break;
+ case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
+ LowerBound = -45.0f;
+ UpperBound = 38.0f;
+ break;
+ case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
+ LowerBound = -4950.0f;
+ UpperBound = 4932.0f;
+ break;
+ case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
+ LowerBound = -1074.0f;
+ UpperBound = 1023.0f;
+ break;
+ case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
+ LowerBound = -149.0f;
+ UpperBound = 127.0f;
+ break;
+ case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
+ LowerBound = -16445.0f;
+ UpperBound = 11383.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedTwoCond;
+ return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
+ LowerBound);
+}
+
+// For pow(x,y), we only handle the following cases:
+// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
+// Cond is: (y > 127)
+// (2) x is a value coming from an integer type.
+// (2.1) if x's bit_size == 8
+// Cond: (x <= 0 || y > 128)
+// (2.2) if x's bit_size is 16
+// Cond: (x <= 0 || y > 64)
+// (2.3) if x's bit_size is 32
+// Cond: (x <= 0 || y > 32)
+// Support for powl(x,y) and powf(x,y) is TBD.
+//
+// Note that the generated condition can be more conservative than the actual
+// condition (i.e. we might still invoke calls that will not set errno).
+//
+Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
+ const LibFunc &Func) {
+ // FIXME: LibFunc_powf and powl TBD.
+ if (Func != LibFunc_pow) {
+ DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ return nullptr;
+ }
+
+ Value *Base = CI->getArgOperand(0);
+ Value *Exp = CI->getArgOperand(1);
+ IRBuilder<> BBBuilder(CI);
+
+ // Constant Base case.
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
+ double D = CF->getValueAPF().convertToDouble();
+ if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
+ DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ return nullptr;
+ }
+
+ ++NumWrappedOneCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ }
+
+ // If the Base value comes from an integer type.
+ Instruction *I = dyn_cast<Instruction>(Base);
+ if (!I) {
+ DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ return nullptr;
+ }
+ unsigned Opcode = I->getOpcode();
+ if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ float UpperV = 0.0f;
+ if (BW == 8)
+ UpperV = 128.0f;
+ else if (BW == 16)
+ UpperV = 64.0f;
+ else if (BW == 32)
+ UpperV = 32.0f;
+ else {
+ DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ return nullptr;
+ }
+
+ ++NumWrappedTwoCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
+ Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ if (!Base->getType()->isFloatTy())
+ V0 = ConstantExpr::getFPExtend(V0, Exp->getType());
+
+ Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
+ return BBBuilder.CreateOr(Cond0, Cond);
+ }
+ DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ return nullptr;
+}
+
+// Wrap the library call with the conditions under which it can set errno.
+void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
+ assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
+ MDNode *BranchWeights =
+ MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
+
+ TerminatorInst *NewInst =
+ SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
+ BasicBlock *CallBB = NewInst->getParent();
+ CallBB->setName("cdce.call");
+ BasicBlock *SuccBB = CallBB->getSingleSuccessor();
+ assert(SuccBB && "The split block should have a single successor");
+ SuccBB->setName("cdce.end");
+ CI->removeFromParent();
+ CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
+ DEBUG(dbgs() << "== Basic Block After ==");
+ DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
+}
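+
+// Editorial illustration (approximate shape, not emitted verbatim): for an
+// unused call to sqrt guarded by its domain-error condition, the split above
+// yields IR along the lines of:
+//
+//   %cond = fcmp olt double %x, 0.000000e+00
+//   br i1 %cond, label %cdce.call, label %cdce.end, !prof !0
+// cdce.call:
+//   call double @sqrt(double %x)
+//   br label %cdce.end
+// cdce.end:
+//   ...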
+
+// Perform the transformation to a single candidate.
+bool LibCallsShrinkWrap::perform(CallInst *CI) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ assert(Callee && "perform() should apply to a non-empty callee");
+ TLI.getLibFunc(*Callee, Func);
+ assert(Func && "perform() is not expecting an empty function");
+
+ if (performCallDomainErrorOnly(CI, Func) ||
+     performCallRangeErrorOnly(CI, Func))
+ return true;
+ return performCallErrors(CI, Func);
+}
+
+void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ DominatorTree *DT) {
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ LibCallsShrinkWrap CCDCE(TLI, DT);
+ CCDCE.visit(F);
+ bool Changed = CCDCE.perform();
+
+// Verify the dominator tree after we've updated it locally.
+#ifndef NDEBUG
+ if (DT)
+ DT->verifyDomTree();
+#endif
+ return Changed;
+}
+
+bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return runImpl(F, TLI, DT);
+}
+
+namespace llvm {
+char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
+
+// Public interface to LibCallsShrinkWrap pass.
+FunctionPass *createLibCallsShrinkWrapPass() {
+ return new LibCallsShrinkWrapLegacyPass();
+}
+
+PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT))
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+}
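+
+// Editorial usage sketch (not part of the original file): scheduling the pass
+// with the legacy pass manager through the factory declared above; M is the
+// enclosing Module and F a function in it.
+//
+//   legacy::FunctionPassManager LegacyFPM(&M);
+//   LegacyFPM.add(llvm::createLibCallsShrinkWrapPass());
+//   LegacyFPM.doInitialization();
+//   LegacyFPM.run(F);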
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
new file mode 100644
index 000000000000..74610613001c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -0,0 +1,2210 @@
+//===-- Local.cpp - Functions to perform local transformations ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "local"
+
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+
+//===----------------------------------------------------------------------===//
+// Local constant propagation.
+//
+
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination. This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+ const TargetLibraryInfo *TLI) {
+ TerminatorInst *T = BB->getTerminator();
+ IRBuilder<> Builder(T);
+
+ // Branch - See if we are conditionally jumping on a constant
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ if (BI->isUnconditional()) return false; // Can't optimize uncond branch
+ BasicBlock *Dest1 = BI->getSuccessor(0);
+ BasicBlock *Dest2 = BI->getSuccessor(1);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+ // Are we branching on constant?
+ // YES. Change to unconditional branch...
+ BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
+ BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1;
+
+ //cerr << "Function: " << T->getParent()->getParent()
+ // << "\nRemoving branch from " << T->getParent()
+ // << "\n\nTo: " << OldDest << endl;
+
+ // Let the basic block know that we are letting go of it. Based on this,
+ // it will adjust its PHI nodes.
+ OldDest->removePredecessor(BB);
+
+ // Replace the conditional branch with an unconditional one.
+ Builder.CreateBr(Destination);
+ BI->eraseFromParent();
+ return true;
+ }
+
+ if (Dest2 == Dest1) { // Conditional branch to same location?
+ // This branch matches something like this:
+ // br bool %cond, label %Dest, label %Dest
+ // and changes it into: br label %Dest
+
+ // Let the basic block know that we are letting go of one copy of it.
+ assert(BI->getParent() && "Terminator not inserted in block!");
+ Dest1->removePredecessor(BI->getParent());
+
+ // Replace the conditional branch with an unconditional one.
+ Builder.CreateBr(Dest1);
+ Value *Cond = BI->getCondition();
+ BI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ return true;
+ }
+ return false;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+ // If we are switching on a constant, we can convert the switch to an
+ // unconditional branch.
+ ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ BasicBlock *TheOnlyDest = DefaultDest;
+
+ // If the default is unreachable, ignore it when searching for TheOnlyDest.
+ if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
+ SI->getNumCases() > 0) {
+ TheOnlyDest = SI->case_begin()->getCaseSuccessor();
+ }
+
+ // Figure out which case it goes to.
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ // Found case matching a constant operand?
+ if (i->getCaseValue() == CI) {
+ TheOnlyDest = i->getCaseSuccessor();
+ break;
+ }
+
+ // Check to see if this branch is going to the same place as the default
+ // dest. If so, eliminate it as an explicit compare.
+ if (i->getCaseSuccessor() == DefaultDest) {
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ unsigned NCases = SI->getNumCases();
+ // Fold the case metadata into the default if there will be any branches
+ // left, unless the metadata doesn't match the switch.
+ if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) {
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ // Merge weight of this case to the default weight.
+ unsigned idx = i->getCaseIndex();
+ Weights[0] += Weights[idx+1];
+ // Remove weight for this case.
+ std::swap(Weights[idx+1], Weights.back());
+ Weights.pop_back();
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(Weights));
+ }
+ // Remove this entry.
+ DefaultDest->removePredecessor(SI->getParent());
+ i = SI->removeCase(i);
+ e = SI->case_end();
+ continue;
+ }
+
+ // Otherwise, check to see if the switch only branches to one destination.
+ // We do this by resetting "TheOnlyDest" to null when we find two non-equal
+ // destinations.
+ if (i->getCaseSuccessor() != TheOnlyDest)
+ TheOnlyDest = nullptr;
+
+ // Increment this iterator as we haven't removed the case.
+ ++i;
+ }
+
+ if (CI && !TheOnlyDest) {
+ // We are branching on a constant that matches none of the cases; go to
+ // the default successor.
+ TheOnlyDest = SI->getDefaultDest();
+ }
+
+ // If we found a single destination that we can fold the switch into, do so
+ // now.
+ if (TheOnlyDest) {
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+ BasicBlock *BB = SI->getParent();
+
+ // Remove entries from PHI nodes which we no longer branch to...
+ for (BasicBlock *Succ : SI->successors()) {
+ // Keep the first edge to TheOnlyDest intact; all other successors lose
+ // BB as a predecessor.
+ if (Succ == TheOnlyDest)
+ TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest
+ else
+ Succ->removePredecessor(BB);
+ }
+
+ // Delete the old switch.
+ Value *Cond = SI->getCondition();
+ SI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ return true;
+ }
+
+ if (SI->getNumCases() == 1) {
+ // Otherwise, we can fold this switch into a conditional branch
+ // instruction if it has only one non-default destination.
+ auto FirstCase = *SI->case_begin();
+ Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+ FirstCase.getCaseValue(), "cond");
+
+ // Insert the new branch.
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
+ }
+
+ // Update make.implicit metadata to the newly-created conditional branch.
+ MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit);
+ if (MakeImplicitMD)
+ NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD);
+
+ // Delete the old switch.
+ SI->eraseFromParent();
+ return true;
+ }
+ return false;
+ }
+
+ if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(T)) {
+ // indirectbr blockaddress(@F, @BB) -> br label @BB
+ if (BlockAddress *BA =
+ dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
+ BasicBlock *TheOnlyDest = BA->getBasicBlock();
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ if (IBI->getDestination(i) == TheOnlyDest)
+ TheOnlyDest = nullptr;
+ else
+ IBI->getDestination(i)->removePredecessor(IBI->getParent());
+ }
+ Value *Address = IBI->getAddress();
+ IBI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
+
+ // If we didn't find our destination in the IBI successor list, then we
+ // have undefined behavior. Replace the unconditional branch with an
+ // 'unreachable' instruction.
+ if (TheOnlyDest) {
+ BB->getTerminator()->eraseFromParent();
+ new UnreachableInst(BB->getContext(), BB);
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Local dead code elimination.
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (!I->use_empty())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (isa<TerminatorInst>(I))
+ return false;
+
+ // We don't want the landingpad-like instructions removed by anything this
+ // general.
+ if (I->isEHPad())
+ return false;
+
+ // We don't want debug info removed by anything this general, unless
+ // debug info is empty.
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
+ if (DDI->getAddress())
+ return false;
+ return true;
+ }
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
+ if (DVI->getValue())
+ return false;
+ return true;
+ }
+
+ if (!I->mayHaveSideEffects())
+ return true;
+
+ // Special case intrinsics that "may have side effects" but can be deleted
+ // when dead.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // Safe to delete llvm.stacksave if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave)
+ return true;
+
+ // Lifetime intrinsics are dead when their right-hand operand is undef.
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)
+ return isa<UndefValue>(II->getArgOperand(1));
+
+ // Assumptions are dead if their condition is trivially true. Guards on
+ // true are operationally no-ops. In the future we can consider more
+ // sophisticated tradeoffs for guards considering potential for check
+ // widening, but for now we keep things simple.
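+ // For example (illustrative): 'call void @llvm.assume(i1 true)' conveys no
+ // information and is removable, while an assume on a non-constant or false
+ // condition is left alone here.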
+ if (II->getIntrinsicID() == Intrinsic::assume ||
+ II->getIntrinsicID() == Intrinsic::experimental_guard) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ return !Cond->isZero();
+
+ return false;
+ }
+ }
+
+ if (isAllocLikeFn(I, TLI))
+ return true;
+
+ if (CallInst *CI = isFreeCall(I, TLI))
+ if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
+ return C->isNullValue() || isa<UndefValue>(C);
+
+ if (CallSite CS = CallSite(I))
+ if (isMathLibCallNoop(CS, TLI))
+ return true;
+
+ return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it. If that makes any of its operands
+/// trivially dead, delete them too, recursively. Return true if any
+/// instructions were deleted.
+bool
+llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+ const TargetLibraryInfo *TLI) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI))
+ return false;
+
+ SmallVector<Instruction*, 16> DeadInsts;
+ DeadInsts.push_back(I);
+
+ do {
+ I = DeadInsts.pop_back_val();
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty()) continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ DeadInsts.push_back(OpI);
+ }
+
+ I->eraseFromParent();
+ } while (!DeadInsts.empty());
+
+ return true;
+}
+
+/// areAllUsesEqual - Check whether the uses of a value are all the same.
+/// This is similar to Instruction::hasOneUse() except this will also return
+/// true when there are no uses or multiple uses that all refer to the same
+/// value.
+static bool areAllUsesEqual(Instruction *I) {
+ Value::user_iterator UI = I->user_begin();
+ Value::user_iterator UE = I->user_end();
+ if (UI == UE)
+ return true;
+
+ User *TheUse = *UI;
+ for (++UI; UI != UE; ++UI) {
+ if (*UI != TheUse)
+ return false;
+ }
+ return true;
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it. If that makes any of its operands trivially dead, delete them
+/// too, recursively. Return true if a change was made.
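+///
+/// For example (illustrative IR): two single-use phis that only feed each
+/// other,
+///   %a = phi i32 [ %b, %bb1 ], [ %b, %bb2 ]
+///   %b = phi i32 [ %a, %bb3 ], [ %a, %bb4 ]
+/// form a dead cycle; both are replaced with undef and deleted.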
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+ const TargetLibraryInfo *TLI) {
+ SmallPtrSet<Instruction*, 4> Visited;
+ for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
+ I = cast<Instruction>(*I->user_begin())) {
+ if (I->use_empty())
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+
+ // If we find an instruction more than once, we're on a cycle that
+ // won't prove fruitful.
+ if (!Visited.insert(I).second) {
+ // Break the cycle and delete the instruction and its operands.
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool
+simplifyAndDCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+
+ return true;
+ }
+
+ if (Value *SimpleV = SimplifyInstruction(I, DL)) {
+ // Add the users to the worklist. CAREFUL: an instruction can use itself,
+ // in the case of a phi node.
+ for (User *U : I->users()) {
+ if (U != I) {
+ WorkList.insert(cast<Instruction>(U));
+ }
+ }
+
+ // Replace the instruction with its simplified value.
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(SimpleV);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+ }
+ return false;
+}
+
+/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
+/// simplify any instructions in it and recursively delete dead instructions.
+///
+/// This returns true if it changed the code; note that it can delete
+/// instructions in other blocks as well as in this block.
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+
+#ifndef NDEBUG
+ // In debug builds, ensure that the terminator of the block is never replaced
+ // or deleted by these simplifications. The idea of simplification is that it
+ // cannot introduce new instructions, and there is no way to replace the
+ // terminator of a block without introducing a new instruction.
+ AssertingVH<Instruction> TerminatorVH(&BB->back());
+#endif
+
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
+ BI != E;) {
+ assert(!BI->isTerminator());
+ Instruction *I = &*BI;
+ ++BI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Graph Restructuring.
+//
+
+
+/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this
+/// method is called when we're about to delete Pred as a predecessor of BB. If
+/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred.
+///
+/// Unlike the removePredecessor method, this attempts to simplify uses of PHI
+/// nodes that collapse into identity values. For example, if we have:
+/// x = phi(1, 0, 0, 0)
+/// y = and x, z
+///
+/// .. and delete the predecessor corresponding to the '1', this will attempt to
+/// recursively fold the and to 0.
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
+ // This only adjusts blocks with PHI nodes.
+ if (!isa<PHINode>(BB->begin()))
+ return;
+
+ // Remove the entries for Pred from the PHI nodes in BB, but do not simplify
+ // them down. This will leave us with single entry phi nodes and other phis
+ // that can be removed.
+ BB->removePredecessor(Pred, true);
+
+ WeakTrackingVH PhiIt = &BB->front();
+ while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
+ PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
+ Value *OldPhiIt = PhiIt;
+
+ if (!recursivelySimplifyInstruction(PN))
+ continue;
+
+ // If recursive simplification ended up deleting the next PHI node we would
+ // iterate to, then our iterator is invalid, restart scanning from the top
+ // of the block.
+ if (PhiIt != OldPhiIt) PhiIt = &BB->front();
+ }
+}
+
+
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
+/// predecessor is known to have one successor (DestBB!). Eliminate the edge
+/// between them, moving the instructions in the predecessor into DestBB and
+/// deleting the predecessor block.
+///
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
+ // If BB has single-entry PHI nodes, fold them.
+ while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+ Value *NewVal = PN->getIncomingValue(0);
+ // Replace self referencing PHI with undef, it must be dead.
+ if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NewVal);
+ PN->eraseFromParent();
+ }
+
+ BasicBlock *PredBB = DestBB->getSinglePredecessor();
+ assert(PredBB && "Block doesn't have a single predecessor!");
+
+ // Zap anything that took the address of DestBB. Not doing this will give the
+ // address an invalid value.
+ if (DestBB->hasAddressTaken()) {
+ BlockAddress *BA = BlockAddress::get(DestBB);
+ Constant *Replacement =
+ ConstantInt::get(llvm::Type::getInt32Ty(BA->getContext()), 1);
+ BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+ BA->getType()));
+ BA->destroyConstant();
+ }
+
+ // Anything that branched to PredBB now branches to DestBB.
+ PredBB->replaceAllUsesWith(DestBB);
+
+ // Splice all the instructions from PredBB to DestBB.
+ PredBB->getTerminator()->eraseFromParent();
+ DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+ // If the PredBB is the entry block of the function, move DestBB up to
+ // become the entry block after we erase PredBB.
+ if (PredBB == &DestBB->getParent()->getEntryBlock())
+ DestBB->moveAfter(PredBB);
+
+ if (DT) {
+ BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock();
+ DT->changeImmediateDominator(DestBB, PredBBIDom);
+ DT->eraseNode(PredBB);
+ }
+ // Nuke BB.
+ PredBB->eraseFromParent();
+}
+
+/// CanMergeValues - Return true if we can choose one of these values to use
+/// in place of the other. Note that we will always choose the non-undef
+/// value to keep.
+static bool CanMergeValues(Value *First, Value *Second) {
+ return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+}
+
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
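+/// For example (illustrative): if a block P is a predecessor of both BB and
+/// Succ, a phi in Succ may receive one value directly from P and a different
+/// value from P by way of BB; such a conflict prevents the fold.
+///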
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+ assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+ DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
+ // Shortcut: if there is only a single predecessor, it must be BB and merging
+ // is always safe.
+ if (Succ->getSinglePredecessor()) return true;
+
+ // Make a list of the predecessors of BB
+ SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Look at all the phi nodes in Succ, to see if they present a conflict when
+ // merging these blocks
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // If the incoming value from BB is again a PHINode in
+ // BB which has the same incoming value for *PI as PN does, we can
+ // merge the phi nodes and then the blocks can still be merged
+ PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+ if (BBPN && BBPN->getParent() == BB) {
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
+ PN->getIncomingValue(PI))) {
+ DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ } else {
+ Value* Val = PN->getIncomingValueForBlock(BB);
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ // See if the incoming value for the common predecessor is equal to the
+ // one for BB, in which case this phi node will not prevent the merging
+ // of the block.
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(Val, PN->getIncomingValue(PI))) {
+ DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with regard to common "
+ << "predecessor " << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+typedef SmallVector<BasicBlock *, 16> PredBlockVector;
+typedef DenseMap<BasicBlock *, Value *> IncomingValueMap;
+
+/// \brief Determines the value to use as the phi node input for a block.
+///
+/// Select between \p OldVal any value that we know flows from \p BB
+/// to a particular phi on the basis of which one (if either) is not
+/// undef. Update IncomingValues based on the selected value.
+///
+/// \param OldVal The value we are considering selecting.
+/// \param BB The block that the value flows in from.
+/// \param IncomingValues A map from block-to-value for other phi inputs
+/// that we have examined.
+///
+/// \returns the selected value.
+static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
+ IncomingValueMap &IncomingValues) {
+ if (!isa<UndefValue>(OldVal)) {
+ assert((!IncomingValues.count(BB) ||
+ IncomingValues.find(BB)->second == OldVal) &&
+ "Expected OldVal to match incoming value from BB!");
+
+ IncomingValues.insert(std::make_pair(BB, OldVal));
+ return OldVal;
+ }
+
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+ if (It != IncomingValues.end()) return It->second;
+
+ return OldVal;
+}
+
+/// \brief Create a map from block to value for the operands of a
+/// given phi.
+///
+/// Create a map from block to value for each non-undef value flowing
+/// into \p PN.
+///
+/// \param PN The phi we are collecting the map for.
+/// \param IncomingValues [out] The map from block to value for this phi.
+static void gatherIncomingValuesToPhi(PHINode *PN,
+ IncomingValueMap &IncomingValues) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V))
+ IncomingValues.insert(std::make_pair(BB, V));
+ }
+}
+
+/// \brief Replace the incoming undef values to a phi with the values
+/// from a block-to-value map.
+///
+/// \param PN The phi we are replacing the undefs in.
+/// \param IncomingValues A map from block to value.
+static void replaceUndefValuesInPhi(PHINode *PN,
+ const IncomingValueMap &IncomingValues) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V)) continue;
+
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+ if (It == IncomingValues.end()) continue;
+
+ PN->setIncomingValue(i, It->second);
+ }
+}
+
+/// \brief Replace a value flowing from a block to a phi with
+/// potentially multiple instances of that value flowing from the
+/// block's predecessors to the phi.
+///
+/// \param BB The block with the value flowing into the phi.
+/// \param BBPreds The predecessors of BB.
+/// \param PN The phi that we are updating.
+static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
+ const PredBlockVector &BBPreds,
+ PHINode *PN) {
+ Value *OldVal = PN->removeIncomingValue(BB, false);
+ assert(OldVal && "No entry in PHI for Pred BB!");
+
+ IncomingValueMap IncomingValues;
+
+ // We are merging two blocks - BB, and the block containing PN - and
+ // as a result we need to redirect edges from the predecessors of BB
+ // to go to the block containing PN, and update PN
+ // accordingly. Since we allow merging blocks in the case where the
+ // predecessor and successor blocks both share some predecessors,
+ // and where some of those common predecessors might have undef
+ // values flowing into PN, we want to rewrite those values to be
+ // consistent with the non-undef values.
+
+ gatherIncomingValuesToPhi(PN, IncomingValues);
+
+ // If this incoming value is one of the PHI nodes in BB, the new entries
+ // in the PHI node are the entries from the old PHI.
+ if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+ PHINode *OldValPN = cast<PHINode>(OldVal);
+ for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
+ // Note that, since we are merging phi nodes and BB and Succ might
+ // have common predecessors, we could end up with a phi node with
+ // identical incoming branches. This will be cleaned up later (and
+ // will trigger asserts if we try to clean it up now, without also
+ // simplifying the corresponding conditional branch).
+ BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
+ Value *PredVal = OldValPN->getIncomingValue(i);
+ Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ } else {
+ for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
+ // Update existing incoming values in PN for this
+ // predecessor of BB.
+ BasicBlock *PredBB = BBPreds[i];
+ Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ }
+
+ replaceUndefValuesInPhi(PN, IncomingValues);
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
+/// unconditional branch, and contains no instructions other than PHI nodes,
+/// potential side-effect free intrinsics and the branch. If possible,
+/// eliminate BB by rewriting all the predecessors to branch to the successor
+/// block and return true. If we can't transform, return false.
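+///
+/// For example (illustrative IR): a block consisting only of
+///   bb:                               ; preds = %a, %b
+///     br label %succ
+/// can be removed by retargeting %a and %b to branch directly to %succ.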
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+ assert(BB != &BB->getParent()->getEntryBlock() &&
+ "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
+
+ // We can't eliminate infinite loops.
+ BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+ if (BB == Succ) return false;
+
+ // Check to see if merging these blocks would cause conflicts for any of the
+ // phi nodes in BB or Succ. If not, we can safely merge.
+ if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+ // Check for cases where Succ has multiple predecessors and a PHI node in BB
+ // has uses which will not disappear when the PHI nodes are merged. It is
+ // possible to handle such cases, but difficult: it requires checking whether
+ // BB dominates Succ, which is non-trivial to calculate in the case where
+ // Succ has multiple predecessors. Also, it requires checking whether
+ // constructing the necessary self-referential PHI node doesn't introduce any
+ // conflicts; this isn't too difficult, but the previous code for doing this
+ // was incorrect.
+ //
+ // Note that if this check finds a live use, BB dominates Succ, so BB is
+ // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+ // folding the branch isn't profitable in that case anyway.
+ if (!Succ->getSinglePredecessor()) {
+ BasicBlock::iterator BBI = BB->begin();
+ while (isa<PHINode>(*BBI)) {
+ for (Use &U : BBI->uses()) {
+ if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) {
+ if (PN->getIncomingBlock(U) != BB)
+ return false;
+ } else {
+ return false;
+ }
+ }
+ ++BBI;
+ }
+ }
+
+ DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+ if (isa<PHINode>(Succ->begin())) {
+ // If there is more than one pred of succ, and there are PHI nodes in
+ // the successor, then we need to add incoming edges for the PHI nodes
+ //
+ const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Loop over all of the PHI nodes in the successor of BB.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
+ }
+ }
+
+ if (Succ->getSinglePredecessor()) {
+ // BB is the only predecessor of Succ, so Succ will end up with exactly
+ // the same predecessors BB had.
+
+ // Copy over any phi, debug or lifetime instruction.
+ BB->getTerminator()->eraseFromParent();
+ Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(),
+ BB->getInstList());
+ } else {
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+ assert(PN->use_empty() && "There shouldn't be any uses here!");
+ PN->eraseFromParent();
+ }
+ }
+
+ // If the unconditional branch we replaced contains llvm.loop metadata, we
+ // add the metadata to the branch instructions in the predecessors.
+ unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
+ Instruction *TI = BB->getTerminator();
+ if (TI)
+ if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+ }
+
+ // Everything that jumped to BB now goes to Succ.
+ BB->replaceAllUsesWith(Succ);
+ if (!Succ->hasName()) Succ->takeName(BB);
+ BB->eraseFromParent(); // Delete the old basic block.
+ return true;
+}
+
+/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI
+/// nodes in this block. This doesn't try to be clever about PHI nodes
+/// which differ only in the order of the incoming values, but instcombine
+/// orders them so it usually won't matter.
+///
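+/// For example (illustrative IR):
+///   %p1 = phi i32 [ %x, %a ], [ %y, %b ]
+///   %p2 = phi i32 [ %x, %a ], [ %y, %b ]
+/// %p2 is identical to %p1, so its uses are rewritten to %p1 and it is erased.
+///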
+bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
+ // This implementation doesn't currently consider undef operands
+ // specially. Theoretically, two phis which are identical except for
+ // one having an undef where the other doesn't could be collapsed.
+
+ struct PHIDenseMapInfo {
+ static PHINode *getEmptyKey() {
+ return DenseMapInfo<PHINode *>::getEmptyKey();
+ }
+ static PHINode *getTombstoneKey() {
+ return DenseMapInfo<PHINode *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(PHINode *PN) {
+ // Compute a hash value on the operands. Instcombine will likely have
+ // sorted them, which helps expose duplicates, but we have to check all
+ // the operands to be safe in case instcombine hasn't run.
+ return static_cast<unsigned>(hash_combine(
+ hash_combine_range(PN->value_op_begin(), PN->value_op_end()),
+ hash_combine_range(PN->block_begin(), PN->block_end())));
+ }
+ static bool isEqual(PHINode *LHS, PHINode *RHS) {
+ if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
+ RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+ };
+
+ // Set of unique PHINodes.
+ DenseSet<PHINode *, PHIDenseMapInfo> PHISet;
+
+ // Examine each PHI.
+ bool Changed = false;
+ for (auto I = BB->begin(); PHINode *PN = dyn_cast<PHINode>(I++);) {
+ auto Inserted = PHISet.insert(PN);
+ if (!Inserted.second) {
+ // A duplicate. Replace this PHI with its duplicate.
+ PN->replaceAllUsesWith(*Inserted.first);
+ PN->eraseFromParent();
+ Changed = true;
+
+ // The RAUW can change PHIs that we already visited. Start over from the
+ // beginning.
+ PHISet.clear();
+ I = BB->begin();
+ }
+ }
+
+ return Changed;
+}
+
+/// enforceKnownAlignment - If the specified pointer points to an object that
+/// we control, modify the object's alignment to PrefAlign. This isn't
+/// often possible though. If alignment is important, a more reliable approach
+/// is to simply align all global variables and allocation instructions to
+/// their preferred alignment from the beginning.
+///
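+/// For example (illustrative): an alloca currently known to be 4-byte aligned
+/// with a preferred alignment of 16 has its alignment raised to 16, provided
+/// 16 does not exceed the natural stack alignment.
+///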
+static unsigned enforceKnownAlignment(Value *V, unsigned Align,
+ unsigned PrefAlign,
+ const DataLayout &DL) {
+ assert(PrefAlign > Align);
+
+ V = V->stripPointerCasts();
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ // TODO: ideally, computeKnownBits ought to have used
+ // AllocaInst::getAlignment() in its computation already, making
+ // the below max redundant. But, as it turns out,
+ // stripPointerCasts recurses through infinite layers of bitcasts,
+ // while computeKnownBits is not allowed to traverse more than 6
+ // levels.
+ Align = std::max(AI->getAlignment(), Align);
+ if (PrefAlign <= Align)
+ return Align;
+
+ // If the preferred alignment is greater than the natural stack alignment
+ // then don't round up. This avoids dynamic stack realignment.
+ if (DL.exceedsNaturalStackAlignment(PrefAlign))
+ return Align;
+ AI->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
+ if (auto *GO = dyn_cast<GlobalObject>(V)) {
+ // TODO: as above, this shouldn't be necessary.
+ Align = std::max(GO->getAlignment(), Align);
+ if (PrefAlign <= Align)
+ return Align;
+
+ // If there is a large requested alignment and we can, bump up the alignment
+ // of the global. If the memory we set aside for the global may not be the
+ // memory used by the final program then it is impossible for us to reliably
+ // enforce the preferred alignment.
+ if (!GO->canIncreaseAlignment())
+ return Align;
+
+ GO->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
+ return Align;
+}
+
+unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
+ const DataLayout &DL,
+ const Instruction *CxtI,
+ AssumptionCache *AC,
+ const DominatorTree *DT) {
+ assert(V->getType()->isPointerTy() &&
+ "getOrEnforceKnownAlignment expects a pointer!");
+
+ KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
+ unsigned TrailZ = Known.countMinTrailingZeros();
+
+ // Avoid trouble with ridiculously large TrailZ values, such as
+ // those computed from a null pointer.
+ TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+
+ unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ);
+
+ // LLVM doesn't support alignments larger than this currently.
+ Align = std::min(Align, +Value::MaximumAlignment);
+
+ if (PrefAlign > Align)
+ Align = enforceKnownAlignment(V, Align, PrefAlign, DL);
+
+ // We don't need to make any adjustment.
+ return Align;
+}
+
+//===----------------------------------------------------------------------===//
+// Dbg Intrinsic utilities
+//
+
+/// See if there is a dbg.value intrinsic for DIVar before I.
+static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
+ Instruction *I) {
+ // Since we can't guarantee that the original dbg.declare intrinsic
+ // is removed by LowerDbgDeclare(), we need to make sure that we are
+ // not inserting the same dbg.value intrinsic over and over.
+ llvm::BasicBlock::InstListType::iterator PrevI(I);
+ if (PrevI != I->getParent()->getInstList().begin()) {
+ --PrevI;
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI))
+ if (DVI->getValue() == I->getOperand(0) &&
+ DVI->getOffset() == 0 &&
+ DVI->getVariable() == DIVar &&
+ DVI->getExpression() == DIExpr)
+ return true;
+ }
+ return false;
+}
+
+/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
+static bool PhiHasDebugValue(DILocalVariable *DIVar,
+ DIExpression *DIExpr,
+ PHINode *APN) {
+ // Since we can't guarantee that the original dbg.declare intrinsic
+ // is removed by LowerDbgDeclare(), we need to make sure that we are
+ // not inserting the same dbg.value intrinsic over and over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ findDbgValues(DbgValues, APN);
+ for (auto *DVI : DbgValues) {
+ assert(DVI->getValue() == APN);
+ assert(DVI->getOffset() == 0);
+ if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
+ return true;
+ }
+ return false;
+}
+
+/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
+/// that has an associated llvm.dbg.declare intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+ StoreInst *SI, DIBuilder &Builder) {
+ auto *DIVar = DDI->getVariable();
+ assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
+
+ // If an argument is zero or sign extended then use the argument directly.
+ // The ZExt/SExt may be zapped by an optimization pass in the future.
+ Argument *ExtendedArg = nullptr;
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+ ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
+ if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+ ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
+ if (ExtendedArg) {
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
+ }
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
+}
+
+/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
+/// that has an associated llvm.dbg.declare intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+ LoadInst *LI, DIBuilder &Builder) {
+ auto *DIVar = DDI->getVariable();
+ auto *DIExpr = DDI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (LdStHasDebugValue(DIVar, DIExpr, LI))
+ return;
+
+ // We are now tracking the loaded value instead of the address. In the
+ // future if multi-location support is added to the IR, it might be
+ // preferable to keep tracking both the loaded value and the original
+ // address in case the alloca can not be elided.
+ Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
+ LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr);
+ DbgValue->insertAfter(LI);
+}
+
+/// Inserts a llvm.dbg.value intrinsic after a phi
+/// that has an associated llvm.dbg.declare intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+ PHINode *APN, DIBuilder &Builder) {
+ auto *DIVar = DDI->getVariable();
+ auto *DIExpr = DDI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (PhiHasDebugValue(DIVar, DIExpr, APN))
+ return;
+
+ BasicBlock *BB = APN->getParent();
+ auto InsertionPt = BB->getFirstInsertionPt();
+
+ // The block may be a catchswitch block, which does not have a valid
+ // insertion point.
+ // FIXME: Insert dbg.value markers in the successors when appropriate.
+ if (InsertionPt != BB->end())
+ Builder.insertDbgValueIntrinsic(APN, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ &*InsertionPt);
+}
+
+/// Determine whether this alloca is either a VLA or an array.
+static bool isArray(AllocaInst *AI) {
+ return AI->isArrayAllocation() ||
+ AI->getType()->getElementType()->isArrayTy();
+}
+
+/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set
+/// of llvm.dbg.value intrinsics.
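+///
+/// For example (illustrative IR, with hypothetical metadata names):
+///   call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !var, metadata !expr)
+/// is erased, and each store of a value %v to %x.addr gains a matching
+///   call void @llvm.dbg.value(metadata i32 %v, i64 0, metadata !var, metadata !expr)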
+bool llvm::LowerDbgDeclare(Function &F) {
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
+ SmallVector<DbgDeclareInst *, 4> Dbgs;
+ for (auto &FI : F)
+ for (Instruction &BI : FI)
+ if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))
+ Dbgs.push_back(DDI);
+
+ if (Dbgs.empty())
+ return false;
+
+ for (auto &I : Dbgs) {
+ DbgDeclareInst *DDI = I;
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ // If this is an alloca for a scalar variable, insert a dbg.value
+ // at each load and store to the alloca and erase the dbg.declare.
+ // The dbg.values allow tracking a variable even if it is not
+ // stored on the stack, while the dbg.declare can only describe
+ // the stack slot (and at a lexical-scope granularity). Later
+ // passes will attempt to elide the stack slot.
+ if (AI && !isArray(AI)) {
+ for (auto &AIUse : AI->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ // This is a call by-value or some other instruction that
+ // takes a pointer to the variable. Insert a *value*
+ // intrinsic that describes the alloca.
+ DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(),
+ DDI->getExpression(), DDI->getDebugLoc(),
+ CI);
+ }
+ }
+ DDI->eraseFromParent();
+ }
+ }
+ return true;
+}
+
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
+/// alloca 'V', if any.
+DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ return DDI;
+
+ return nullptr;
+}
+
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DbgValues.push_back(DVI);
+}
+
+
+bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
+ Instruction *InsertBefore, DIBuilder &Builder,
+ bool Deref, int Offset) {
+ DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address);
+ if (!DDI)
+ return false;
+ DebugLoc Loc = DDI->getDebugLoc();
+ auto *DIVar = DDI->getVariable();
+ auto *DIExpr = DDI->getExpression();
+ assert(DIVar && "Missing variable");
+ DIExpr = DIExpression::prepend(DIExpr, Deref, Offset);
+ // Insert llvm.dbg.declare immediately after the original alloca, and remove
+ // old llvm.dbg.declare.
+ Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
+ DDI->eraseFromParent();
+ return true;
+}
+
+bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder, bool Deref, int Offset) {
+ return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder,
+ Deref, Offset);
+}
+
+static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
+ DIBuilder &Builder, int Offset) {
+ DebugLoc Loc = DVI->getDebugLoc();
+ auto *DIVar = DVI->getVariable();
+ auto *DIExpr = DVI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ // This is an alloca-based llvm.dbg.value. The first thing it should do with
+ // the alloca pointer is dereference it. Otherwise we don't know how to handle
+ // it and give up.
+ if (!DIExpr || DIExpr->getNumElements() < 1 ||
+ DIExpr->getElement(0) != dwarf::DW_OP_deref)
+ return;
+
+ // Insert the offset immediately after the first deref.
+ // We could just change the offset argument of dbg.value, but it's unsigned...
+ if (Offset) {
+ SmallVector<uint64_t, 4> Ops;
+ Ops.push_back(dwarf::DW_OP_deref);
+ DIExpression::appendOffset(Ops, Offset);
+ Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
+ DIExpr = Builder.createExpression(Ops);
+ }
+
+ Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr,
+ Loc, DVI);
+ DVI->eraseFromParent();
+}
+
+void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder, int Offset) {
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
+ replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
+ }
+}
+
+void llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ auto &M = *I.getModule();
+
+ auto MDWrap = [&](Value *V) {
+ return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+ };
+
+ if (isa<BitCastInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
+ // to use the cast's source.
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ unsigned BitWidth =
+ M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
+ APInt Offset(BitWidth, 0);
+ // Rewrite a constant GEP into a DIExpression. Since we are performing
+ // arithmetic to compute the variable's *value* in the DIExpression, we
+ // need to mark the expression with a DW_OP_stack_value.
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ // GEP offsets are i32 and thus always fit into an int64_t.
+ DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref,
+ Offset.getSExtValue(),
+ DIExpression::WithStackValue);
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+ } else if (isa<LoadInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Rewrite the load into DW_OP_deref.
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+}
+
+unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
+ unsigned NumDeadInst = 0;
+ // Delete the instructions backwards, as it has a reduced likelihood of
+ // having to update as many def-use and use-def chains.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != &BB->front()) {
+ // Delete the next to last instruction.
+ Instruction *Inst = &*--EndInst->getIterator();
+ if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
+ EndInst = Inst;
+ continue;
+ }
+ if (!isa<DbgInfoIntrinsic>(Inst))
+ ++NumDeadInst;
+ Inst->eraseFromParent();
+ }
+ return NumDeadInst;
+}
+
+unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
+ bool PreserveLCSSA) {
+ BasicBlock *BB = I->getParent();
+ // Loop over all of the successors, removing BB's entry from any PHI
+ // nodes.
+ for (BasicBlock *Successor : successors(BB))
+ Successor->removePredecessor(BB, PreserveLCSSA);
+
+ // Insert a call to llvm.trap right before this. This turns the undefined
+ // behavior into a hard fail instead of falling through into random code.
+ if (UseLLVMTrap) {
+ Function *TrapFn =
+ Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+ CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+ CallTrap->setDebugLoc(I->getDebugLoc());
+ }
+ new UnreachableInst(I->getContext(), I);
+
+ // All instructions after this are dead.
+ unsigned NumInstrsRemoved = 0;
+ BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
+ while (BBI != BBE) {
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BB->getInstList().erase(BBI++);
+ ++NumInstrsRemoved;
+ }
+ return NumInstrsRemoved;
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
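+///
+/// For example (illustrative IR):
+///   invoke void @f() to label %normal unwind label %lpad
+/// becomes:
+///   call void @f()
+///   br label %normal
+/// and %lpad loses this block as a predecessor.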
+static void changeToCall(InvokeInst *II) {
+ SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles,
+ "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ II->replaceAllUsesWith(NewCall);
+
+ // Follow the call by a branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Update PHI nodes in the unwind destination
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+}
+
+BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
+ BasicBlock *UnwindEdge) {
+ BasicBlock *BB = CI->getParent();
+
+ // Convert this function call into an invoke instruction. First, split the
+ // basic block.
+ BasicBlock *Split =
+ BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
+
+ // Delete the unconditional branch inserted by splitBasicBlock
+ BB->getInstList().pop_back();
+
+ // Create the new invoke instruction.
+ SmallVector<Value *, 8> InvokeArgs(CI->arg_begin(), CI->arg_end());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ // Note: we're round tripping operand bundles through memory here, and that
+ // can potentially be avoided with a cleverer API design that we do not have
+ // as of this time.
+
+ InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge,
+ InvokeArgs, OpBundles, CI->getName(), BB);
+ II->setDebugLoc(CI->getDebugLoc());
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+
+ // Make sure that anything using the call now uses the invoke! This also
+ // updates the CallGraph if present, because it uses a WeakTrackingVH.
+ CI->replaceAllUsesWith(II);
+
+ // Delete the original call
+ Split->getInstList().pop_front();
+ return Split;
+}
+
+static bool markAliveBlocks(Function &F,
+ SmallPtrSetImpl<BasicBlock*> &Reachable) {
+
+ SmallVector<BasicBlock*, 128> Worklist;
+ BasicBlock *BB = &F.front();
+ Worklist.push_back(BB);
+ Reachable.insert(BB);
+ bool Changed = false;
+ do {
+ BB = Worklist.pop_back_val();
+
+ // Do a quick scan of the basic block, turning any obviously unreachable
+ // instructions into LLVM unreachable insts. The instruction combining pass
+ // canonicalizes unreachable insts into stores to null or undef.
+ for (Instruction &I : *BB) {
+ // Assumptions that are known to be false are equivalent to unreachable.
+ // Also, if the condition is undefined, then we make the choice most
+ // beneficial to the optimizer, and choose that to also be unreachable.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::assume) {
+ if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(II, false);
+ Changed = true;
+ break;
+ }
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current compilation
+ // unit if the predicate passed to it is false. If the predicate is a
+ // constant false, then we know the guard will bail out of the current
+ // compile unconditionally, so all code following it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(II->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(II->getNextNode())) {
+ changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledValue();
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false);
+ Changed = true;
+ break;
+ }
+ if (CI->doesNotReturn()) {
+ // If we found a call to a no-return function, insert an unreachable
+ // instruction after it. Make sure there isn't *already* one there
+ // though.
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI->getNextNode(), false);
+ Changed = true;
+ }
+ break;
+ }
+ }
+
+ // Store to undef and store to null are undefined and used to signal that
+ // they should be changed to unreachable by passes that can't modify the
+ // CFG.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ // Don't touch volatile stores.
+ if (SI->isVolatile()) continue;
+
+ Value *Ptr = SI->getOperand(1);
+
+ if (isa<UndefValue>(Ptr) ||
+ (isa<ConstantPointerNull>(Ptr) &&
+ SI->getPointerAddressSpace() == 0)) {
+ changeToUnreachable(SI, true);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ TerminatorInst *Terminator = BB->getTerminator();
+ if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
+ Value *Callee = II->getCalledValue();
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true);
+ Changed = true;
+ } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
+ if (II->use_empty() && II->onlyReadsMemory()) {
+ // The invoke is dead apart from control flow; just branch to the normal
+ // destination.
+ BranchInst::Create(II->getNormalDest(), II);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ } else
+ changeToCall(II);
+ Changed = true;
+ }
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
+ // Remove catchpads which cannot be reached.
+ struct CatchPadDenseMapInfo {
+ static CatchPadInst *getEmptyKey() {
+ return DenseMapInfo<CatchPadInst *>::getEmptyKey();
+ }
+ static CatchPadInst *getTombstoneKey() {
+ return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(CatchPadInst *CatchPad) {
+ return static_cast<unsigned>(hash_combine_range(
+ CatchPad->value_op_begin(), CatchPad->value_op_end()));
+ }
+ static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
+ if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
+ RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+ };
+
+ // Set of unique CatchPads.
+ SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
+ CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
+ HandlerSet;
+ detail::DenseSetEmpty Empty;
+ for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
+ E = CatchSwitch->handler_end();
+ I != E; ++I) {
+ BasicBlock *HandlerBB = *I;
+ auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
+ if (!HandlerSet.insert({CatchPad, Empty}).second) {
+ CatchSwitch->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
+ }
+
+ Changed |= ConstantFoldTerminator(BB, true);
+ for (BasicBlock *Successor : successors(BB))
+ if (Reachable.insert(Successor).second)
+ Worklist.push_back(Successor);
+ } while (!Worklist.empty());
+ return Changed;
+}
+
+void llvm::removeUnwindEdge(BasicBlock *BB) {
+ TerminatorInst *TI = BB->getTerminator();
+
+ if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ changeToCall(II);
+ return;
+ }
+
+ TerminatorInst *NewTI;
+ BasicBlock *UnwindDest;
+
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
+ NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
+ UnwindDest = CRI->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
+ CatchSwitch->getName(), CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+
+ NewTI = NewCatchSwitch;
+ UnwindDest = CatchSwitch->getUnwindDest();
+ } else {
+ llvm_unreachable("Could not find unwind successor");
+ }
+
+ NewTI->takeName(TI);
+ NewTI->setDebugLoc(TI->getDebugLoc());
+ UnwindDest->removePredecessor(BB);
+ TI->replaceAllUsesWith(NewTI);
+ TI->eraseFromParent();
+}
+
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo
+/// after modifying the CFG.
+bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
+ SmallPtrSet<BasicBlock*, 16> Reachable;
+ bool Changed = markAliveBlocks(F, Reachable);
+
+ // If there are unreachable blocks in the CFG...
+ if (Reachable.size() == F.size())
+ return Changed;
+
+ assert(Reachable.size() < F.size());
+ NumRemoved += F.size()-Reachable.size();
+
+ // Loop over all of the basic blocks that are not reachable, dropping all of
+ // their internal references...
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Reachable.count(&*BB))
+ continue;
+
+ for (BasicBlock *Successor : successors(&*BB))
+ if (Reachable.count(Successor))
+ Successor->removePredecessor(&*BB);
+ if (LVI)
+ LVI->eraseBlock(&*BB);
+ BB->dropAllReferences();
+ }
+
+ for (Function::iterator I = ++F.begin(); I != F.end();)
+ if (!Reachable.count(&*I))
+ I = F.getBasicBlockList().erase(I);
+ else
+ ++I;
+
+ return true;
+}
+
+void llvm::combineMetadata(Instruction *K, const Instruction *J,
+ ArrayRef<unsigned> KnownIDs) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ K->dropUnknownNonDebugMetadata(KnownIDs);
+ K->getAllMetadataOtherThanDebugLoc(Metadata);
+ for (const auto &MD : Metadata) {
+ unsigned Kind = MD.first;
+ MDNode *JMD = J->getMetadata(Kind);
+ MDNode *KMD = MD.second;
+
+ switch (Kind) {
+ default:
+ K->setMetadata(Kind, nullptr); // Remove unknown metadata
+ break;
+ case LLVMContext::MD_dbg:
+ llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+ case LLVMContext::MD_tbaa:
+ K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+ break;
+ case LLVMContext::MD_alias_scope:
+ K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
+ break;
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
+ break;
+ case LLVMContext::MD_range:
+ K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+ break;
+ case LLVMContext::MD_fpmath:
+ K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+ break;
+ case LLVMContext::MD_invariant_load:
+ // Only set the !invariant.load if it is present in both instructions.
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_nonnull:
+ // Only set the !nonnull if it is present in both instructions.
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_invariant_group:
+ // Preserve !invariant.group in K.
+ break;
+ case LLVMContext::MD_align:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ }
+ }
+ // Set !invariant.group from J if J has it. If both instructions have it
+ // then we will just pick it from J - even when they are different.
+ // Also make sure that K is load or store - f.e. combining bitcast with load
+ // could produce bitcast with invariant.group metadata, which is invalid.
+ // FIXME: we should try to preserve both invariant.group md if they are
+ // different, but right now instruction can only have one invariant.group.
+ if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group))
+ if (isa<LoadInst>(K) || isa<StoreInst>(K))
+ K->setMetadata(LLVMContext::MD_invariant_group, JMD);
+}
+
+void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null};
+ combineMetadata(K, J, KnownIDs);
+}
+
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
+ assert(From->getType() == To->getType());
+
+ unsigned Count = 0;
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
+ assert(From->getType() == To->getType());
+ auto *BB = From->getParent();
+ unsigned Count = 0;
+
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == BB)
+ continue;
+ U.set(To);
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
+}
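+
+// Editor's note: illustrative sketch only, not part of the upstream change.
+// Callers such as GVN-style passes use the overloads above after proving a
+// fact along a CFG edge; the helper name below is hypothetical.
+#if 0
+static unsigned propagateEquality(Value *From, Value *To, BasicBlock *Src,
+                                  BasicBlock *Dst, DominatorTree &DT) {
+  // Rewrite only the uses of From that are dominated by the Src->Dst edge;
+  // uses reachable along other paths keep the original value.
+  BasicBlockEdge Edge(Src, Dst);
+  return replaceDominatedUsesWith(From, To, DT, Edge);
+}
+#endif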
+
+bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
+ // Check if the function is specifically marked as a gc leaf function.
+ if (CS.hasFnAttr("gc-leaf-function"))
+ return true;
+ if (const Function *F = CS.getCalledFunction()) {
+ if (F->hasFnAttribute("gc-leaf-function"))
+ return true;
+
+ if (auto IID = F->getIntrinsicID())
+ // Most LLVM intrinsics do not take safepoints.
+ return IID != Intrinsic::experimental_gc_statepoint &&
+ IID != Intrinsic::experimental_deoptimize;
+ }
+
+ return false;
+}
+
+void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
+ LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // This only directly applies if the new type is also a pointer.
+ if (NewTy->isPointerTy()) {
+ NewLI.setMetadata(LLVMContext::MD_nonnull, N);
+ return;
+ }
+
+ // The only other translation we can do is to integral loads with !range
+ // metadata.
+ if (!NewTy->isIntegerTy())
+ return;
+
+ MDBuilder MDB(NewLI.getContext());
+ const Value *Ptr = OldLI.getPointerOperand();
+ auto *ITy = cast<IntegerType>(NewTy);
+ auto *NullInt = ConstantExpr::getPtrToInt(
+ ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
+ auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
+ NewLI.setMetadata(LLVMContext::MD_range,
+ MDB.createRange(NonNullInt, NullInt));
+}
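+
+// Editor's note (illustrative only, not part of the upstream change): the
+// effect of copyNonnullMetadata when a pointer-typed load is rewritten as an
+// integer load of the same width, shown as IR in a comment:
+//
+//   %p = load i8*, i8** %slot, !nonnull !0
+//
+// becomes, after the rewrite plus a call to this helper,
+//
+//   %i = load i64, i64* %slot.cast, !range !1   ; !1 = !{i64 1, i64 0}
+//
+// i.e. the wrapping range [1, 0), which admits every value except null.
+// A minimal, hypothetical caller-side sketch:
+#if 0
+static void transferNonnull(const LoadInst &OldLI, LoadInst &NewLI) {
+  if (MDNode *N = OldLI.getMetadata(LLVMContext::MD_nonnull))
+    copyNonnullMetadata(OldLI, N, NewLI);
+}
+#endif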
+
+void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
+ MDNode *N, LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+  // Give up unless the load is converted to a pointer type, where there is a
+  // single, very valuable mapping we can do reliably.
+ // FIXME: It would be nice to propagate this in more ways, but the type
+ // conversions make it hard.
+ if (!NewTy->isPointerTy())
+ return;
+
+ unsigned BitWidth = DL.getTypeSizeInBits(NewTy);
+ if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
+ MDNode *NN = MDNode::get(OldLI.getContext(), None);
+ NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
+ }
+}
+
+namespace {
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+ BitPart(Value *P, unsigned BW) : Provider(P) {
+ Provenance.resize(BW);
+ }
+
+ /// The Value that this is a bitreverse/bswap of.
+ Value *Provider;
+  /// The "provenance" of each bit. Provenance[A] = B means that bit B
+  /// in Provider becomes bit A in the result of this expression.
+ SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+ enum { Unset = -1 };
+};
+} // end anonymous namespace
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
+/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a null
+/// BitPart (i.e. the Value has been analyzed and the analysis failed), we use
+/// an Optional type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+///
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+ std::map<Value *, Optional<BitPart>> &BPS) {
+ auto I = BPS.find(V);
+ if (I != BPS.end())
+ return I->second;
+
+ auto &Result = BPS[V] = None;
+ auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If this is an or instruction, it may be an inner node of the bswap.
+ if (I->getOpcode() == Instruction::Or) {
+ auto &A = collectBitParts(I->getOperand(0), MatchBSwaps,
+ MatchBitReversals, BPS);
+ auto &B = collectBitParts(I->getOperand(1), MatchBSwaps,
+ MatchBitReversals, BPS);
+ if (!A || !B)
+ return Result;
+
+ // Try and merge the two together.
+ if (!A->Provider || A->Provider != B->Provider)
+ return Result;
+
+ Result = BitPart(A->Provider, BitWidth);
+ for (unsigned i = 0; i < A->Provenance.size(); ++i) {
+ if (A->Provenance[i] != BitPart::Unset &&
+ B->Provenance[i] != BitPart::Unset &&
+ A->Provenance[i] != B->Provenance[i])
+ return Result = None;
+
+ if (A->Provenance[i] == BitPart::Unset)
+ Result->Provenance[i] = B->Provenance[i];
+ else
+ Result->Provenance[i] = A->Provenance[i];
+ }
+
+ return Result;
+ }
+
+ // If this is a logical shift by a constant, recurse then shift the result.
+ if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+ unsigned BitShift =
+ cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+ // Ensure the shift amount is defined.
+ if (BitShift > BitWidth)
+ return Result;
+
+ auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+ MatchBitReversals, BPS);
+ if (!Res)
+ return Result;
+ Result = Res;
+
+ // Perform the "shift" on BitProvenance.
+ auto &P = Result->Provenance;
+ if (I->getOpcode() == Instruction::Shl) {
+ P.erase(std::prev(P.end(), BitShift), P.end());
+ P.insert(P.begin(), BitShift, BitPart::Unset);
+ } else {
+ P.erase(P.begin(), std::next(P.begin(), BitShift));
+ P.insert(P.end(), BitShift, BitPart::Unset);
+ }
+
+ return Result;
+ }
+
+ // If this is a logical 'and' with a mask that clears bits, recurse then
+ // unset the appropriate bits.
+ if (I->getOpcode() == Instruction::And &&
+ isa<ConstantInt>(I->getOperand(1))) {
+ APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
+ const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+ // Check that the mask allows a multiple of 8 bits for a bswap, for an
+ // early exit.
+ unsigned NumMaskedBits = AndMask.countPopulation();
+ if (!MatchBitReversals && NumMaskedBits % 8 != 0)
+ return Result;
+
+ auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+ MatchBitReversals, BPS);
+ if (!Res)
+ return Result;
+ Result = Res;
+
+ for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1)
+ // If the AndMask is zero for this bit, clear the bit.
+ if ((AndMask & Bit) == 0)
+ Result->Provenance[i] = BitPart::Unset;
+ return Result;
+ }
+
+ // If this is a zext instruction zero extend the result.
+ if (I->getOpcode() == Instruction::ZExt) {
+ auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+ MatchBitReversals, BPS);
+ if (!Res)
+ return Result;
+
+ Result = BitPart(Res->Provider, BitWidth);
+ auto NarrowBitWidth =
+ cast<IntegerType>(cast<ZExtInst>(I)->getSrcTy())->getBitWidth();
+ for (unsigned i = 0; i < NarrowBitWidth; ++i)
+ Result->Provenance[i] = Res->Provenance[i];
+ for (unsigned i = NarrowBitWidth; i < BitWidth; ++i)
+ Result->Provenance[i] = BitPart::Unset;
+ return Result;
+ }
+ }
+
+ // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+ // the input value to the bswap/bitreverse.
+ Result = BitPart(V, BitWidth);
+ for (unsigned i = 0; i < BitWidth; ++i)
+ Result->Provenance[i] = i;
+ return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ if (From % 8 != To % 8)
+ return false;
+ // Convert from bit indices to byte indices and check for a byte reversal.
+ From >>= 3;
+ To >>= 3;
+ BitWidth >>= 3;
+ return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ return From == BitWidth - To - 1;
+}
+
+/// Given an OR instruction, check to see if this is a bswap or bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+bool llvm::recognizeBSwapOrBitReverseIdiom(
+ Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+ SmallVectorImpl<Instruction *> &InsertedInsts) {
+ if (Operator::getOpcode(I) != Instruction::Or)
+ return false;
+ if (!MatchBSwaps && !MatchBitReversals)
+ return false;
+ IntegerType *ITy = dyn_cast<IntegerType>(I->getType());
+ if (!ITy || ITy->getBitWidth() > 128)
+ return false; // Can't do vectors or integers > 128 bits.
+ unsigned BW = ITy->getBitWidth();
+
+ unsigned DemandedBW = BW;
+ IntegerType *DemandedTy = ITy;
+ if (I->hasOneUse()) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(I->user_back())) {
+ DemandedTy = cast<IntegerType>(Trunc->getType());
+ DemandedBW = DemandedTy->getBitWidth();
+ }
+ }
+
+ // Try to find all the pieces corresponding to the bswap.
+ std::map<Value *, Optional<BitPart>> BPS;
+ auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
+ if (!Res)
+ return false;
+ auto &BitProvenance = Res->Provenance;
+
+ // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+ // only byteswap values with an even number of bytes.
+ bool OKForBSwap = DemandedBW % 16 == 0, OKForBitReverse = true;
+ for (unsigned i = 0; i < DemandedBW; ++i) {
+ OKForBSwap &=
+ bitTransformIsCorrectForBSwap(BitProvenance[i], i, DemandedBW);
+ OKForBitReverse &=
+ bitTransformIsCorrectForBitReverse(BitProvenance[i], i, DemandedBW);
+ }
+
+ Intrinsic::ID Intrin;
+ if (OKForBSwap && MatchBSwaps)
+ Intrin = Intrinsic::bswap;
+ else if (OKForBitReverse && MatchBitReversals)
+ Intrin = Intrinsic::bitreverse;
+ else
+ return false;
+
+ if (ITy != DemandedTy) {
+ Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy);
+ Value *Provider = Res->Provider;
+ IntegerType *ProviderTy = cast<IntegerType>(Provider->getType());
+ // We may need to truncate the provider.
+ if (DemandedTy != ProviderTy) {
+ auto *Trunc = CastInst::Create(Instruction::Trunc, Provider, DemandedTy,
+ "trunc", I);
+ InsertedInsts.push_back(Trunc);
+ Provider = Trunc;
+ }
+ auto *CI = CallInst::Create(F, Provider, "rev", I);
+ InsertedInsts.push_back(CI);
+ auto *ExtInst = CastInst::Create(Instruction::ZExt, CI, ITy, "zext", I);
+ InsertedInsts.push_back(ExtInst);
+ return true;
+ }
+
+ Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
+ InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
+ return true;
+}
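+
+// Editor's note: the IR below is an illustrative example (not part of the
+// upstream change) of the classic i32 byte-swap idiom that the routine above
+// is meant to match, followed by a hedged sketch of a caller; the helper name
+// tryFormBSwap is hypothetical.
+//
+//   %s0 = shl  i32 %x, 24
+//   %s1 = shl  i32 %x, 8
+//   %m1 = and  i32 %s1, 16711680                ; 0x00FF0000
+//   %s2 = lshr i32 %x, 8
+//   %m2 = and  i32 %s2, 65280                   ; 0x0000FF00
+//   %s3 = lshr i32 %x, 24
+//   %o0 = or   i32 %s0, %m1
+//   %o1 = or   i32 %o0, %m2
+//   %r  = or   i32 %o1, %s3   ; recognized as call i32 @llvm.bswap.i32(i32 %x)
+#if 0
+static bool tryFormBSwap(Instruction *Root) {
+  SmallVector<Instruction *, 4> Inserted;
+  if (!recognizeBSwapOrBitReverseIdiom(Root, /*MatchBSwaps=*/true,
+                                       /*MatchBitReversals=*/false, Inserted))
+    return false;
+  // The last inserted instruction computes the same value with Root's type.
+  Root->replaceAllUsesWith(Inserted.back());
+  // The original or/shift/and tree is now dead; clean it up.
+  RecursivelyDeleteTriviallyDeadInstructions(Root);
+  return true;
+}
+#endif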
+
+// CodeGen has special handling for some string functions that may replace
+// them with target-specific intrinsics. Since that'd skip our interceptors
+// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
+// we mark affected calls as NoBuiltin, which will disable optimization
+// in CodeGen.
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
+ CallInst *CI, const TargetLibraryInfo *TLI) {
+ Function *F = CI->getCalledFunction();
+ LibFunc Func;
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
+ !F->doesNotAccessMemory())
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+}
+
+bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+ // We can't have a PHI with a metadata type.
+ if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+ return false;
+
+ // Early exit.
+ if (!isa<Constant>(I->getOperand(OpIdx)))
+ return true;
+
+ switch (I->getOpcode()) {
+ default:
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ // Can't handle inline asm. Skip it.
+ if (isa<InlineAsm>(ImmutableCallSite(I).getCalledValue()))
+ return false;
+ // Many arithmetic intrinsics have no issue taking a
+    // variable, but it's hard to distinguish these from
+    // special cases such as @llvm.frameaddress that require a constant.
+ if (isa<IntrinsicInst>(I))
+ return false;
+
+ // Constant bundle operands may need to retain their constant-ness for
+ // correctness.
+ if (ImmutableCallSite(I).isBundleOperand(OpIdx))
+ return false;
+ return true;
+ case Instruction::ShuffleVector:
+ // Shufflevector masks are constant.
+ return OpIdx != 2;
+ case Instruction::Switch:
+ case Instruction::ExtractValue:
+ // All operands apart from the first are constant.
+ return OpIdx == 0;
+ case Instruction::InsertValue:
+ // All operands apart from the first and the second are constant.
+ return OpIdx < 2;
+ case Instruction::Alloca:
+ // Static allocas (constant size in the entry block) are handled by
+ // prologue/epilogue insertion so they're free anyway. We definitely don't
+ // want to make them non-constant.
+    return !cast<AllocaInst>(I)->isStaticAlloca();
+ case Instruction::GetElementPtr:
+ if (OpIdx == 0)
+ return true;
+ gep_type_iterator It = gep_type_begin(I);
+ for (auto E = std::next(It, OpIdx); It != E; ++It)
+ if (It.isStruct())
+ return false;
+ return true;
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
new file mode 100644
index 000000000000..e21e34df8ded
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -0,0 +1,877 @@
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header. This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header). This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplify"
+
+STATISTIC(NumNested , "Number of nested loops split out");
+
+// If the new block isn't already well placed, move it right after some
+// 'outside block' block. This prevents the preheader from being placed inside
+// the loop body, e.g. when the loop hasn't been rotated.
+static void placeSplitBlockCarefully(BasicBlock *NewBB,
+ SmallVectorImpl<BasicBlock *> &SplitPreds,
+ Loop *L) {
+ // Check to see if NewBB is already well placed.
+ Function::iterator BBI = --NewBB->getIterator();
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ if (&*BBI == SplitPreds[i])
+ return;
+ }
+
+ // If it isn't already after an outside block, move it after one. This is
+ // always good as it makes the uncond branch from the outside block into a
+ // fall-through.
+
+ // Figure out *which* outside block to put this after. Prefer an outside
+ // block that neighbors a BB actually in the loop.
+ BasicBlock *FoundBB = nullptr;
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ Function::iterator BBI = SplitPreds[i]->getIterator();
+ if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {
+ FoundBB = SplitPreds[i];
+ break;
+ }
+ }
+
+ // If our heuristic for a *good* bb to place this after doesn't find
+ // anything, just pick something. It's likely better than leaving it within
+ // the loop.
+ if (!FoundBB)
+ FoundBB = SplitPreds[0];
+ NewBB->moveAfter(FoundBB);
+}
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one. This method has two phases:
+/// preheader insertion and analysis updating.
+///
+BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the set of predecessors of the loop that are not in the loop.
+ SmallVector<BasicBlock*, 8> OutsideBlocks;
+ for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P)) { // Coming in from outside the loop?
+ // If the loop is branched to from an indirect branch, we won't
+ // be able to fully transform the loop, because it prohibits
+ // edge splitting.
+ if (isa<IndirectBrInst>(P->getTerminator())) return nullptr;
+
+ // Keep track of it.
+ OutsideBlocks.push_back(P);
+ }
+ }
+
+ // Split out the loop pre-header.
+ BasicBlock *PreheaderBB;
+ PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT,
+ LI, PreserveLCSSA);
+ if (!PreheaderBB)
+ return nullptr;
+
+ DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+ << PreheaderBB->getName() << "\n");
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
+
+ return PreheaderBB;
+}
+
+/// Add the specified block, and all of its predecessors, to the specified set,
+/// if it's not already in there. Stop predecessor traversal when we reach
+/// StopBlock.
+static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+ std::set<BasicBlock*> &Blocks) {
+ SmallVector<BasicBlock *, 8> Worklist;
+ Worklist.push_back(InputBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ if (Blocks.insert(BB).second && BB != StopBlock)
+      // If BB has not already been processed and is not the stop block,
+      // insert its predecessors into the work list.
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *WBB = *I;
+ Worklist.push_back(WBB);
+ }
+ } while (!Worklist.empty());
+}
+
+/// \brief The first part of loop-nestification is to find a PHI node that tells
+/// us how to partition the loops.
+static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+ AssumptionCache *AC) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I);
+ ++I;
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ // This is a degenerate PHI already, don't modify it!
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Scan this PHI node looking for a use of the PHI node by itself.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == PN &&
+ L->contains(PN->getIncomingBlock(i)))
+ // We found something tasty to remove.
+ return PN;
+ }
+ return nullptr;
+}
+
+/// \brief If this loop has multiple backedges, try to pull one of them out into
+/// a nested loop.
+///
+/// This is important for code that looks like
+/// this:
+///
+/// Loop:
+/// ...
+/// br cond, Loop, Next
+/// ...
+/// br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop. PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, bool PreserveLCSSA,
+ AssumptionCache *AC) {
+ // Don't try to separate loops without a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // The header is not a landing pad; preheader insertion should ensure this.
+ BasicBlock *Header = L->getHeader();
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ PHINode *PN = findPHIToPartitionLoops(L, DT, AC);
+ if (!PN) return nullptr; // No known way to partition.
+
+ // Pull out all predecessors that have varying values in the loop. This
+ // handles the case when a PHI node has multiple instances of itself as
+ // arguments.
+ SmallVector<BasicBlock*, 8> OuterLoopPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (PN->getIncomingValue(i) != PN ||
+ !L->contains(PN->getIncomingBlock(i))) {
+ // We can't split indirectbr edges.
+ if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator()))
+ return nullptr;
+ OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+ }
+ }
+ DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+
+ // If ScalarEvolution is around and knows anything about values in
+ // this loop, tell it to forget them, because we're about to
+ // substantially change it.
+ if (SE)
+ SE->forgetLoop(L);
+
+ BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
+ DT, LI, PreserveLCSSA);
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+
+ // Create the new outer loop.
+ Loop *NewOuter = new Loop();
+
+ // Change the parent loop to use the outer loop as its child now.
+ if (Loop *Parent = L->getParentLoop())
+ Parent->replaceChildLoopWith(L, NewOuter);
+ else
+ LI->changeTopLevelLoop(L, NewOuter);
+
+ // L is now a subloop of our outer loop.
+ NewOuter->addChildLoop(L);
+
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ NewOuter->addBlockEntry(*I);
+
+ // Now reset the header in L, which had been moved by
+ // SplitBlockPredecessors for the outer loop.
+ L->moveToHeader(Header);
+
+ // Determine which blocks should stay in L and which should be moved out to
+ // the Outer loop now.
+ std::set<BasicBlock*> BlocksInL;
+ for (pred_iterator PI=pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) {
+ BasicBlock *P = *PI;
+ if (DT->dominates(Header, P))
+ addBlockAndPredsToSet(P, Header, BlocksInL);
+ }
+
+ // Scan all of the loop children of L, moving them to OuterLoop if they are
+ // not part of the inner loop.
+ const std::vector<Loop*> &SubLoops = L->getSubLoops();
+ for (size_t I = 0; I != SubLoops.size(); )
+ if (BlocksInL.count(SubLoops[I]->getHeader()))
+ ++I; // Loop remains in L
+ else
+ NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+ SmallVector<BasicBlock *, 8> OuterLoopBlocks;
+ OuterLoopBlocks.push_back(NewBB);
+ // Now that we know which blocks are in L and which need to be moved to
+ // OuterLoop, move any blocks that need it.
+ for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+ BasicBlock *BB = L->getBlocks()[i];
+ if (!BlocksInL.count(BB)) {
+ // Move this block to the parent, updating the exit blocks sets
+ L->removeBlockFromLoop(BB);
+ if ((*LI)[BB] == L) {
+ LI->changeLoopFor(BB, NewOuter);
+ OuterLoopBlocks.push_back(BB);
+ }
+ --i;
+ }
+ }
+
+ // Split edges to exit blocks from the inner loop, if they emerged in the
+ // process of separating the outer one.
+ formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
+
+ if (PreserveLCSSA) {
+ // Fix LCSSA form for L. Some values, which previously were only used inside
+ // L, can now be used in NewOuter loop. We need to insert phi-nodes for them
+ // in corresponding exit blocks.
+ // We don't need to form LCSSA recursively, because there cannot be uses
+ // inside a newly created loop of defs from inner loops as those would
+ // already be a use of an LCSSA phi node.
+ formLCSSA(*L, *DT, LI, SE);
+
+ assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA is broken after separating nested loops!");
+ }
+
+ return NewOuter;
+}
+
+/// \brief This method is called when the specified loop has more than one
+/// backedge in it.
+///
+/// If this occurs, revector all of these backedges to target a new basic block
+/// and have that block branch to the loop header. This ensures that loops
+/// have exactly one backedge.
+static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI) {
+ assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+ // Get information about the loop
+ BasicBlock *Header = L->getHeader();
+ Function *F = Header->getParent();
+
+ // Unique backedge insertion currently depends on having a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // The header is not an EH pad; preheader insertion should ensure this.
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ // Figure out which basic blocks contain back-edges to the loop header.
+ std::vector<BasicBlock*> BackedgeBlocks;
+ for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
+ BasicBlock *P = *I;
+
+ // Indirectbr edges cannot be split, so we must fail if we find one.
+ if (isa<IndirectBrInst>(P->getTerminator()))
+ return nullptr;
+
+ if (P != Preheader) BackedgeBlocks.push_back(P);
+ }
+
+ // Create and insert the new backedge block...
+ BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
+ Header->getName() + ".backedge", F);
+ BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+ BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
+
+ DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+ << BEBlock->getName() << "\n");
+
+ // Move the new backedge block to right after the last backedge block.
+ Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
+ F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+ // Now that the block has been inserted into the function, create PHI nodes in
+ // the backedge block which correspond to any PHI nodes in the header block.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
+ PN->getName()+".be", BETerminator);
+
+ // Loop over the PHI node, moving all entries except the one for the
+ // preheader over to the new PHI node.
+ unsigned PreheaderIdx = ~0U;
+ bool HasUniqueIncomingValue = true;
+ Value *UniqueValue = nullptr;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *IBB = PN->getIncomingBlock(i);
+ Value *IV = PN->getIncomingValue(i);
+ if (IBB == Preheader) {
+ PreheaderIdx = i;
+ } else {
+ NewPN->addIncoming(IV, IBB);
+ if (HasUniqueIncomingValue) {
+ if (!UniqueValue)
+ UniqueValue = IV;
+ else if (UniqueValue != IV)
+ HasUniqueIncomingValue = false;
+ }
+ }
+ }
+
+ // Delete all of the incoming values from the old PN except the preheader's
+ assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+ if (PreheaderIdx != 0) {
+ PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+ PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+ }
+ // Nuke all entries except the zero'th.
+ for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+ PN->removeIncomingValue(e-i, false);
+
+ // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+ PN->addIncoming(NewPN, BEBlock);
+
+ // As an optimization, if all incoming values in the new PhiNode (which is a
+ // subset of the incoming values of the old PHI node) have the same value,
+ // eliminate the PHI Node.
+ if (HasUniqueIncomingValue) {
+ NewPN->replaceAllUsesWith(UniqueValue);
+ BEBlock->getInstList().erase(NewPN);
+ }
+ }
+
+ // Now that all of the PHI nodes have been inserted and adjusted, modify the
+ // backedge blocks to jump to the BEBlock instead of the header.
+ // If one of the backedges has llvm.loop metadata attached, we remove
+ // it from the backedge and add it to BEBlock.
+ unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
+ MDNode *LoopMD = nullptr;
+ for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+ TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+ if (!LoopMD)
+ LoopMD = TI->getMetadata(LoopMDKind);
+ TI->setMetadata(LoopMDKind, nullptr);
+ for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
+ if (TI->getSuccessor(Op) == Header)
+ TI->setSuccessor(Op, BEBlock);
+ }
+ BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+
+ //===--- Update all analyses which we must preserve now -----------------===//
+
+ // Update Loop Information - we know that this block is now in the current
+ // loop and all parent loops.
+ L->addBasicBlockToLoop(BEBlock, *LI);
+
+ // Update dominator information
+ DT->splitBlock(BEBlock);
+
+ return BEBlock;
+}
+
+/// \brief Simplify one loop and queue further loops for simplification.
+static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+ReprocessLoop:
+
+ // Check to see that no blocks (other than the header) in this loop have
+ // predecessors that are not in the loop. This is not valid for natural
+ // loops, but can occur if the blocks are unreachable. Since they are
+ // unreachable we can just shamelessly delete those CFG edges!
+ for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
+ BB != E; ++BB) {
+ if (*BB == L->getHeader()) continue;
+
+ SmallPtrSet<BasicBlock*, 4> BadPreds;
+ for (pred_iterator PI = pred_begin(*BB),
+ PE = pred_end(*BB); PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P))
+ BadPreds.insert(P);
+ }
+
+ // Delete each unique out-of-loop (and thus dead) predecessor.
+ for (BasicBlock *P : BadPreds) {
+
+ DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+ << P->getName() << "\n");
+
+ // Zap the dead pred's terminator and replace it with unreachable.
+ TerminatorInst *TI = P->getTerminator();
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA);
+ Changed = true;
+ }
+ }
+
+ // If there are exiting blocks with branches on undef, resolve the undef in
+ // the direction which will exit the loop. This will help simplify loop
+ // trip count computations.
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (BasicBlock *ExitingBlock : ExitingBlocks)
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+ if (BI->isConditional()) {
+ if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+
+ DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
+
+ BI->setCondition(ConstantInt::get(Cond->getType(),
+ !L->contains(BI->getSuccessor(0))));
+
+ // This may make the loop analyzable, force SCEV recomputation.
+ if (SE)
+ SE->forgetLoop(L);
+
+ Changed = true;
+ }
+ }
+
+ // Does the loop already have a preheader? If so, don't insert one.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ if (Preheader)
+ Changed = true;
+ }
+
+ // Next, check to make sure that all exit nodes of the loop only have
+ // predecessors that are inside of the loop. This check guarantees that the
+ // loop preheader/header will dominate the exit blocks. If the exit block has
+ // predecessors from outside of the loop, split the edge now.
+ if (formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA))
+ Changed = true;
+
+ // If the header has more than two predecessors at this point (from the
+ // preheader and from multiple backedges), we must adjust the loop.
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ // If this is really a nested loop, rip it out into a child loop. Don't do
+ // this for loops with a giant number of backedges, just factor them into a
+ // common backedge instead.
+ if (L->getNumBackEdges() < 8) {
+ if (Loop *OuterL =
+ separateNestedLoop(L, Preheader, DT, LI, SE, PreserveLCSSA, AC)) {
+ ++NumNested;
+ // Enqueue the outer loop as it should be processed next in our
+ // depth-first nest walk.
+ Worklist.push_back(OuterL);
+
+ // This is a big restructuring change, reprocess the whole loop.
+ Changed = true;
+ // GCC doesn't tail recursion eliminate this.
+ // FIXME: It isn't clear we can't rely on LLVM to TRE this.
+ goto ReprocessLoop;
+ }
+ }
+
+ // If we either couldn't, or didn't want to, identify nesting of the loops,
+ // insert a new block that all backedges target, then make it jump to the
+ // loop header.
+ LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI);
+ if (LoopLatch)
+ Changed = true;
+ }
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Scan over the PHI nodes in the loop header. Since they now have only two
+ // incoming values (the loop is canonicalized), we may have simplified the PHI
+ // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+ PHINode *PN;
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ (PN = dyn_cast<PHINode>(I++)); )
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ if (SE) SE->forgetValue(PN);
+ if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ }
+ }
+
+ // If this loop has multiple exits and the exits all go to the same
+ // block, attempt to merge the exits. This helps several passes, such
+ // as LoopRotation, which do not support loops with multiple exits.
+ // SimplifyCFG also does this (and this code uses the same utility
+ // function), however this code is loop-aware, where SimplifyCFG is
+ // not. That gives it the advantage of being able to hoist
+ // loop-invariant instructions out of the way to open up more
+ // opportunities, and the disadvantage of having the responsibility
+ // to preserve dominator information.
+ auto HasUniqueExitBlock = [&]() {
+ BasicBlock *UniqueExit = nullptr;
+ for (auto *ExitingBB : ExitingBlocks)
+ for (auto *SuccBB : successors(ExitingBB)) {
+ if (L->contains(SuccBB))
+ continue;
+
+ if (!UniqueExit)
+ UniqueExit = SuccBB;
+ else if (UniqueExit != SuccBB)
+ return false;
+ }
+
+ return true;
+ };
+ if (HasUniqueExitBlock()) {
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitingBlock = ExitingBlocks[i];
+ if (!ExitingBlock->getSinglePredecessor()) continue;
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!BI || !BI->isConditional()) continue;
+ CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI || CI->getParent() != ExitingBlock) continue;
+
+ // Attempt to hoist out all instructions except for the
+ // comparison and the branch.
+ bool AllInvariant = true;
+ bool AnyInvariant = false;
+ for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) {
+ Instruction *Inst = &*I++;
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+ if (Inst == CI)
+ continue;
+ if (!L->makeLoopInvariant(Inst, AnyInvariant,
+ Preheader ? Preheader->getTerminator()
+ : nullptr)) {
+ AllInvariant = false;
+ break;
+ }
+ }
+ if (AnyInvariant) {
+ Changed = true;
+ // The loop disposition of all SCEV expressions that depend on any
+ // hoisted values have also changed.
+ if (SE)
+ SE->forgetLoopDispositions(L);
+ }
+ if (!AllInvariant) continue;
+
+ // The block has now been cleared of all instructions except for
+ // a comparison and a conditional branch. SimplifyCFG may be able
+ // to fold it now.
+ if (!FoldBranchToCommonDest(BI))
+ continue;
+
+ // Success. The block is now dead, so remove it from the loop,
+ // update the dominator tree and delete it.
+ DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+ << ExitingBlock->getName() << "\n");
+
+ // Notify ScalarEvolution before deleting this block. Currently assume the
+      // parent loop doesn't change (splitting edges doesn't count). If blocks,
+      // CFG edges, or other values in the parent loop change, then we need to
+      // call forgetLoop() for the parent instead.
+ if (SE)
+ SE->forgetLoop(L);
+
+ assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
+ Changed = true;
+ LI->removeBlock(ExitingBlock);
+
+ DomTreeNode *Node = DT->getNode(ExitingBlock);
+ const std::vector<DomTreeNodeBase<BasicBlock> *> &Children =
+ Node->getChildren();
+ while (!Children.empty()) {
+ DomTreeNode *Child = Children.front();
+ DT->changeImmediateDominator(Child, Node->getIDom());
+ }
+ DT->eraseNode(ExitingBlock);
+
+ BI->getSuccessor(0)->removePredecessor(
+ ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA);
+ BI->getSuccessor(1)->removePredecessor(
+ ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA);
+ ExitingBlock->eraseFromParent();
+ }
+ }
+
+ return Changed;
+}
+
+bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+
+#ifndef NDEBUG
+ // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
+ // form.
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
+
+ // Worklist maintains our depth-first queue of loops in this nest to process.
+ SmallVector<Loop *, 4> Worklist;
+ Worklist.push_back(L);
+
+ // Walk the worklist from front to back, pushing newly found sub loops onto
+ // the back. This will let us process loops from back to front in depth-first
+ // order. We can use this simple process because loops form a tree.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ Loop *L2 = Worklist[Idx];
+ Worklist.append(L2->begin(), L2->end());
+ }
+
+ while (!Worklist.empty())
+ Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE,
+ AC, PreserveLCSSA);
+
+ return Changed;
+}
+
+namespace {
+ struct LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(ID) {
+ initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ // We need loop information to identify the loops...
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ }
+
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const override;
+ };
+}
+
+char LoopSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LoopSimplifyID = LoopSimplify::ID;
+Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnFunction(Function &F) {
+ bool Changed = false;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ // Simplify each loop nest in the function.
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA);
+
+#ifndef NDEBUG
+ if (PreserveLCSSA) {
+ bool InLCSSA = all_of(
+ *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
+ assert(InLCSSA && "LCSSA is broken after loop-simplify.");
+ }
+#endif
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = false;
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+
+ // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA
+ // after simplifying the loops.
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ Changed |= simplifyLoop(*I, DT, LI, SE, AC, /*PreserveLCSSA*/ false);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<DependenceAnalysis>();
+ return PA;
+}
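+
+// Editor's note: illustrative only, not part of the upstream change. With the
+// new pass manager the pass above is scheduled as an ordinary function pass;
+// a minimal caller-side sketch:
+#if 0
+static void addLoopCanonicalization(FunctionPassManager &FPM) {
+  // Canonicalize loops first so later loop passes see a preheader, dedicated
+  // exit blocks and a single backedge. Run LCSSA separately if it is needed.
+  FPM.addPass(LoopSimplifyPass());
+}
+#endif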
+
+// FIXME: Restore this code when we re-enable verification in verifyAnalysis
+// below.
+#if 0
+static void verifyLoop(Loop *L) {
+ // Verify subloops.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ verifyLoop(*I);
+
+ // It used to be possible to just assert L->isLoopSimplifyForm(), however
+ // with the introduction of indirectbr, there are now cases where it's
+ // not possible to transform a loop as necessary. We can at least check
+ // that there is an indirectbr near any time there's trouble.
+
+ // Indirectbr can interfere with preheader and unique backedge insertion.
+ if (!L->getLoopPreheader() || !L->getLoopLatch()) {
+ bool HasIndBrPred = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PE = pred_end(L->getHeader()); PI != PE; ++PI)
+ if (isa<IndirectBrInst>((*PI)->getTerminator())) {
+ HasIndBrPred = true;
+ break;
+ }
+ assert(HasIndBrPred &&
+ "LoopSimplify has no excuse for missing loop header info!");
+ (void)HasIndBrPred;
+ }
+
+ // Indirectbr can interfere with exit block canonicalization.
+ if (!L->hasDedicatedExits()) {
+ bool HasIndBrExiting = false;
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
+ HasIndBrExiting = true;
+ break;
+ }
+ }
+
+ assert(HasIndBrExiting &&
+ "LoopSimplify has no excuse for missing exit block info!");
+ (void)HasIndBrExiting;
+ }
+}
+#endif
+
+void LoopSimplify::verifyAnalysis() const {
+ // FIXME: This routine is being called mid-way through the loop pass manager
+ // as loop passes destroy this analysis. That's actually fine, but we have no
+ // way of expressing that here. Once all of the passes that destroy this are
+ // hoisted out of the loop pass manager we can add back verification here.
+#if 0
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ verifyLoop(*I);
+#endif
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
new file mode 100644
index 000000000000..f2527f89e83e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -0,0 +1,871 @@
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches. This will be corrected in the future.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
+
+static cl::opt<bool>
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolled loops to be unrolled "
+ "with epilog instead of prolog."));
+
+static cl::opt<bool>
+UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
+ cl::desc("Verify domtree after unrolling"),
+#ifdef NDEBUG
+ cl::init(false)
+#else
+ cl::init(true)
+#endif
+ );
+
+/// Convert the instruction operands from referencing the current values into
+/// those specified by VMap.
+static inline void remapInstruction(Instruction *I,
+ ValueToValueMapTy &VMap) {
+ for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+ Value *Op = I->getOperand(op);
+ ValueToValueMapTy::iterator It = VMap.find(Op);
+ if (It != VMap.end())
+ I->setOperand(op, It->second);
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
+ if (It != VMap.end())
+ PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
+ }
+ }
+}
+
+/// Folds a basic block into its predecessor if it only has one predecessor, and
+/// that predecessor only has one successor.
+/// The LoopInfo Analysis that is passed will be kept consistent. If folding is
+/// successful, references to the containing loop must be removed from
+/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
+/// references to the eliminated BB. The argument ForgottenLoops contains a set
+/// of loops that have already been forgotten to prevent redundant, expensive
+/// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
+static BasicBlock *
+foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
+ SmallPtrSetImpl<Loop *> &ForgottenLoops,
+ DominatorTree *DT) {
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
+ BasicBlock *OnlyPred = BB->getSinglePredecessor();
+ if (!OnlyPred) return nullptr;
+
+ if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+
+ DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
+
+ // Resolve any PHI nodes at the start of the block. They are all
+ // guaranteed to have exactly one entry if they exist, unless there are
+ // multiple duplicate (but guaranteed to be equal) entries for the
+ // incoming edges. This occurs when there are multiple edges from
+ // OnlyPred to OnlySucc.
+ FoldSingleEntryPHINodes(BB);
+
+ // Delete the unconditional branch from the predecessor...
+ OnlyPred->getInstList().pop_back();
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(OnlyPred);
+
+ // Move all definitions in the successor to the predecessor...
+ OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
+
+ // OldName will be valid until erased.
+ StringRef OldName = BB->getName();
+
+ // Erase the old block and update dominator info.
+ if (DT)
+ if (DomTreeNode *DTN = DT->getNode(BB)) {
+ DomTreeNode *PredDTN = DT->getNode(OnlyPred);
+ SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
+ for (auto *DI : Children)
+ DT->changeImmediateDominator(DI, PredDTN);
+
+ DT->eraseNode(BB);
+ }
+
+ // ScalarEvolution holds references to loop exit blocks.
+ if (SE) {
+ if (Loop *L = LI->getLoopFor(BB)) {
+ if (ForgottenLoops.insert(L).second)
+ SE->forgetLoop(L);
+ }
+ }
+ LI->removeBlock(BB);
+
+ // Inherit predecessor's name if it exists...
+ if (!OldName.empty() && !OnlyPred->hasName())
+ OnlyPred->setName(OldName);
+
+ BB->eraseFromParent();
+
+ return OnlyPred;
+}
+
+/// Check if unrolling created a situation where we need to insert phi nodes to
+/// preserve LCSSA form.
+/// \param Blocks is a vector of basic blocks representing unrolled loop.
+/// \param L is the outer loop.
+/// It's possible that some of the blocks are in L, and some are not. In this
+/// case, if there is a use outside L and the definition is inside L, we need
+/// to insert a phi-node, otherwise LCSSA will be broken.
+/// The function is just a helper function for llvm::UnrollLoop that returns
+/// true if this situation occurs, indicating that LCSSA needs to be fixed.
+static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
+ LoopInfo *LI) {
+ for (BasicBlock *BB : Blocks) {
+ if (LI->getLoopFor(BB) == L)
+ continue;
+ for (Instruction &I : *BB) {
+ for (Use &U : I.operands()) {
+ if (auto Def = dyn_cast<Instruction>(U)) {
+ Loop *DefLoop = LI->getLoopFor(Def->getParent());
+ if (!DefLoop)
+ continue;
+ if (DefLoop->contains(L))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
+/// and adds a mapping from the original loop to the new loop to NewLoops.
+/// Returns nullptr if no new loop was created, and otherwise returns a pointer
+/// to the original loop that OriginalBB was part of.
+const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+ BasicBlock *ClonedBB, LoopInfo *LI,
+ NewLoopsMap &NewLoops) {
+  // Figure out which loop the original block was in.
+ const Loop *OldLoop = LI->getLoopFor(OriginalBB);
+ assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+ Loop *&NewLoop = NewLoops[OldLoop];
+ if (!NewLoop) {
+ // Found a new sub-loop.
+ assert(OriginalBB == OldLoop->getHeader() &&
+ "Header should be first in RPO");
+
+ NewLoop = new Loop();
+ Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+
+ if (NewLoopParent)
+ NewLoopParent->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return OldLoop;
+ } else {
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return nullptr;
+ }
+}
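+
+// Editor's note: illustrative only, not part of the upstream change. A hedged
+// sketch of how a cloning-based transform is expected to drive the helper
+// above: blocks are visited in RPO and the loop being cloned is pre-seeded so
+// that only sub-loops receive fresh Loop objects. The surrounding helper name
+// is hypothetical.
+#if 0
+static void cloneLoopBodyOnce(Loop *L, ArrayRef<BasicBlock *> RPOBlocks,
+                              LoopInfo *LI, ValueToValueMapTy &VMap) {
+  Function *F = L->getHeader()->getParent();
+  NewLoopsMap NewLoops;
+  // The loop being cloned keeps its identity; only sub-loops get clones.
+  NewLoops[L] = L;
+  for (BasicBlock *BB : RPOBlocks) {
+    BasicBlock *Clone = CloneBasicBlock(BB, VMap, ".clone", F);
+    VMap[BB] = Clone;
+    // Registers Clone in LoopInfo, creating a cloned sub-loop whenever the
+    // header of a sub-loop is reached (hence the RPO requirement).
+    addClonedBlockToLoopInfo(BB, Clone, LI, NewLoops);
+  }
+}
+#endif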
+
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unrolling is more profitable when there is a PHI that starts from a
+/// constant: in that case the epilog leaves the PHI starting from a constant,
+/// but the prolog converts it to a non-constant.
+///
+/// loop:
+/// PN = PHI [I, Latch], [CI, PreHeader]
+/// I = foo(PN)
+/// ...
+///
+/// Epilog unroll case.
+/// loop:
+/// PN = PHI [I2, Latch], [CI, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+/// Prolog unroll case.
+/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+/// PN = PHI [I2, Latch], [NewPN, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+///
+static bool isEpilogProfitable(Loop *L) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ assert(PreHeader && Header);
+ for (Instruction &BBI : *Header) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ if (!PN)
+ break;
+ if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
+ return true;
+ }
+ return false;
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
+/// if unrolling was successful, or false if the loop was unmodified. Unrolling
+/// can only fail when the loop's latch block is not terminated by a conditional
+/// branch instruction. However, if the trip count (and multiple) are not known,
+/// loop unrolling will mostly produce more code that is no faster.
+///
+/// TripCount is the upper bound of the iteration on which control exits
+/// LatchBlock. Control may exit the loop prior to TripCount iterations either
+/// via an early branch in other loop block or via LatchBlock terminator. This
+/// is relaxed from the general definition of trip count which is the number of
+/// times the loop header executes. Note that UnrollLoop assumes that the loop
+/// counter test is in LatchBlock in order to remove unnecessary instances of
+/// the test. If control can exit the loop from the LatchBlock's terminator
+/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
+///
+/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
+/// needs to be preserved. It is needed when we use trip count upper bound to
+/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
+/// conditional branch needs to be preserved.
+///
+/// Similarly, TripMultiple divides the number of times that the LatchBlock may
+/// execute without exiting the loop.
+///
+/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
+/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
+/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
+/// iterations before branching into the unrolled loop. UnrollLoop will not
+/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
+/// AllowExpensiveTripCount is false.
+///
+/// If we want to perform PGO-based loop peeling, PeelCount is set to the
+/// number of iterations we want to peel off.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
+/// DominatorTree if they are non-null.
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
+ bool AllowRuntime, bool AllowExpensiveTripCount,
+ bool PreserveCondBr, bool PreserveOnlyFirst,
+ unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
+ bool PreserveLCSSA) {
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
+ return false;
+ }
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (!LatchBlock) {
+ DEBUG(dbgs() << " Can't unroll; loop latch-block-insertion failed.\n");
+ return false;
+ }
+
+ // Loops with indirectbr cannot be cloned.
+ if (!L->isSafeToClone()) {
+ DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
+ return false;
+ }
+
+ // The current loop unroll pass can only unroll loops with a single latch
+ // that's a conditional branch exiting the loop.
+ // FIXME: The implementation can be extended to work with more complicated
+ // cases, e.g. loops with multiple latches.
+ BasicBlock *Header = L->getHeader();
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+ if (!BI || BI->isUnconditional()) {
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ DEBUG(dbgs() <<
+ " Can't unroll; loop not terminated by a conditional branch.\n");
+ return false;
+ }
+
+ auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
+ return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2));
+ };
+
+ if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
+ DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
+ " exiting the loop can be unrolled\n");
+ return false;
+ }
+
+ if (Header->hasAddressTaken()) {
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ DEBUG(dbgs() <<
+ " Won't unroll loop: address of header block is taken.\n");
+ return false;
+ }
+
+ if (TripCount != 0)
+ DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
+ if (TripMultiple != 1)
+ DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
+
+ // Effectively "DCE" unrolled iterations that are beyond the tripcount
+ // and will never be executed.
+ if (TripCount != 0 && Count > TripCount)
+ Count = TripCount;
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2 && PeelCount == 0) {
+ DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ return false;
+ }
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = Count == TripCount;
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
+
+ // Go through all exits of L and see if there are any phi-nodes there. We just
+ // conservatively assume that they're inserted to preserve LCSSA form, which
+ // means that complete unrolling might break this form. We need to either fix
+ // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
+ // now we just recompute LCSSA for the outer loop, but it should be possible
+ // to fix it in-place.
+ bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
+ any_of(ExitBlocks, [](const BasicBlock *BB) {
+ return isa<PHINode>(BB->begin());
+ });
+
+ // We assume a run-time trip count if the compiler cannot
+ // figure out the loop trip count and the unroll-runtime
+ // flag is specified.
+ bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
+
+ assert((!RuntimeTripCount || !PeelCount) &&
+ "Did not expect runtime trip-count unrolling "
+ "and peeling for the same loop");
+
+ if (PeelCount)
+ peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
+
+ // Loops containing convergent instructions must have a count that divides
+ // their TripMultiple.
+ DEBUG(
+ {
+ bool HasConvergent = false;
+ for (auto &BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto CS = CallSite(&I))
+ HasConvergent |= CS.isConvergent();
+ assert((!HasConvergent || TripMultiple % Count == 0) &&
+ "Unroll count must divide trip multiple if loop contains a "
+ "convergent operation.");
+ });
+
+ bool EpilogProfitability =
+ UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+ : isEpilogProfitable(L);
+
+ if (RuntimeTripCount && TripMultiple % Count != 0 &&
+ !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
+ EpilogProfitability, LI, SE, DT,
+ PreserveLCSSA)) {
+ if (Force)
+ RuntimeTripCount = false;
+ else {
+ DEBUG(
+ dbgs() << "Won't unroll; remainder loop could not be generated "
+ "when assuming runtime trip count\n");
+ return false;
+ }
+ }
+
+ // Notify ScalarEvolution that the loop will be substantially changed,
+ // if not outright eliminated.
+ if (SE)
+ SE->forgetLoop(L);
+
+ // If we know the trip count, we know the multiple...
+ unsigned BreakoutTrip = 0;
+ if (TripCount != 0) {
+ BreakoutTrip = TripCount % Count;
+ TripMultiple = 0;
+ } else {
+ // Figure out what multiple to use.
+ BreakoutTrip = TripMultiple =
+ (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
+ }
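+ // A worked example (hypothetical numbers): with TripCount = 10 and Count = 4,
+ // BreakoutTrip = 10 % 4 = 2 and TripMultiple becomes 0, so only the latch
+ // copy that can break out on trip 2 keeps its conditional branch below. With
+ // an unknown TripCount, Count = 4 and TripMultiple = 6, BreakoutTrip =
+ // TripMultiple = gcd(4, 6) = 2, so every second latch copy stays conditional.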
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << TripCount << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unrolled loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else if (PeelCount) {
+ DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << PeelCount << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
+ L->getHeader())
+ << " peeled loop by " << NV("PeelCount", PeelCount)
+ << " iterations");
+ } else {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ Diag << "unrolled loop by a factor of " << NV("UnrollCount", Count);
+
+ DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
+ << " by " << Count);
+ if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
+ DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+ ORE->emit(Diag << " with a breakout at trip "
+ << NV("BreakoutTrip", BreakoutTrip));
+ } else if (TripMultiple != 1) {
+ DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ ORE->emit(Diag << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch");
+ } else if (RuntimeTripCount) {
+ DEBUG(dbgs() << " with run-time trip count");
+ ORE->emit(Diag << " with run-time trip count");
+ }
+ DEBUG(dbgs() << "!\n");
+ }
+
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+
+ // For the first iteration of the loop, we should use the precloned values for
+ // PHI nodes. Insert associations now.
+ ValueToValueMapTy LastValueMap;
+ std::vector<PHINode*> OrigPHINode;
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ OrigPHINode.push_back(cast<PHINode>(I));
+ }
+
+ std::vector<BasicBlock*> Headers;
+ std::vector<BasicBlock*> Latches;
+ Headers.push_back(Header);
+ Latches.push_back(LatchBlock);
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
+
+ // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
+ // might break loop-simplified form for these loops (as they, e.g., would
+ // share the same exit blocks). We'll keep track of loops for which we can
+ // break this so that later we can re-simplify them.
+ SmallSetVector<Loop *, 4> LoopsToSimplify;
+ for (Loop *SubLoop : *L)
+ LoopsToSimplify.insert(SubLoop);
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (const DILocation *DIL = I.getDebugLoc())
+ I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
+ for (unsigned It = 1; It != Count; ++It) {
+ std::vector<BasicBlock*> NewBlocks;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+ "Header should not be in a sub-loop");
+ // Tell LI about New.
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop) {
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
+
+ // Forget the old loop, since its inputs may have changed.
+ if (SE)
+ SE->forgetLoop(OldLoop);
+ }
+
+ if (*BB == Header)
+ // Loop over all of the PHI nodes in the block, changing them to use
+ // the incoming values from the previous block.
+ for (PHINode *OrigPHI : OrigPHINode) {
+ PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
+ Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+ if (It > 1 && L->contains(InValI))
+ InVal = LastValueMap[InValI];
+ VMap[OrigPHI] = InVal;
+ New->getInstList().erase(NewPHI);
+ }
+
+ // Update our running map of newest clones
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI)
+ LastValueMap[VI->first] = VI->second;
+
+ // Add phi entries for newly created values to all exit blocks.
+ for (BasicBlock *Succ : successors(*BB)) {
+ if (L->contains(Succ))
+ continue;
+ for (BasicBlock::iterator BBI = Succ->begin();
+ PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
+ Value *Incoming = phi->getIncomingValueForBlock(*BB);
+ ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
+ if (It != LastValueMap.end())
+ Incoming = It->second;
+ phi->addIncoming(Incoming, New);
+ }
+ }
+ // Keep track of new headers and latches as we create them, so that
+ // we can insert the proper branches later.
+ if (*BB == Header)
+ Headers.push_back(New);
+ if (*BB == LatchBlock)
+ Latches.push_back(New);
+
+ NewBlocks.push_back(New);
+ UnrolledLoopBlocks.push_back(New);
+
+ // Update DomTree: since we just copy the loop body, and each copy has a
+ // dedicated entry block (copy of the header block), this header's copy
+ // dominates all copied blocks. That means, dominance relations in the
+ // copied body are the same as in the original body.
+ if (DT) {
+ if (*BB == Header)
+ DT->addNewBlock(New, Latches[It - 1]);
+ else {
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ ::remapInstruction(&I, LastValueMap);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+ }
+
+ // Loop over the PHI nodes in the original block, setting incoming values.
+ for (PHINode *PN : OrigPHINode) {
+ if (CompletelyUnroll) {
+ PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+ Header->getInstList().erase(PN);
+ }
+ else if (Count > 1) {
+ Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+ // If this value was defined in the loop, take the value defined by the
+ // last iteration of the loop.
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+ if (L->contains(InValI))
+ InVal = LastValueMap[InVal];
+ }
+ assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
+ PN->addIncoming(InVal, Latches.back());
+ }
+ }
+
+ // Now that all the basic blocks for the unrolled iterations are in place,
+ // set up the branches to connect them.
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+ // The original branch was replicated in each unrolled iteration.
+ BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ BasicBlock *Dest = Headers[j];
+ bool NeedConditional = true;
+
+ if (RuntimeTripCount && j != 0) {
+ NeedConditional = false;
+ }
+
+ // For a complete unroll, make the last iteration end with a branch
+ // to the exit block.
+ if (CompletelyUnroll) {
+ if (j == 0)
+ Dest = LoopExit;
+ // If using trip count upper bound to completely unroll, we need to keep
+ // the conditional branch except the last one because the loop may exit
+ // after any iteration.
+ assert(NeedConditional &&
+ "NeedConditional cannot be modified by both complete "
+ "unrolling and runtime unrolling");
+ NeedConditional = (PreserveCondBr && j && !(PreserveOnlyFirst && i != 0));
+ } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ NeedConditional = false;
+ }
+
+ if (NeedConditional) {
+ // Update the conditional branch's successor for the following
+ // iteration.
+ Term->setSuccessor(!ContinueOnTrue, Dest);
+ } else {
+ // Remove phi operands at this loop exit
+ if (Dest != LoopExit) {
+ BasicBlock *BB = Latches[i];
+ for (BasicBlock *Succ: successors(BB)) {
+ if (Succ == Headers[i])
+ continue;
+ for (BasicBlock::iterator BBI = Succ->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
+ Phi->removeIncomingValue(BB, false);
+ }
+ }
+ }
+ // Replace the conditional branch with an unconditional one.
+ BranchInst::Create(Dest, Term);
+ Term->eraseFromParent();
+ }
+ }
+
+ // Update dominators of blocks we might reach through exits.
+ // Immediate dominator of such block might change, because we add more
+ // routes which can lead to the exit: we can now reach it from the copied
+ // iterations too.
+ if (DT && Count > 1) {
+ for (auto *BB : OriginalLoopBlocks) {
+ auto *BBDomNode = DT->getNode(BB);
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ for (auto *ChildDomNode : BBDomNode->getChildren()) {
+ auto *ChildBB = ChildDomNode->getBlock();
+ if (!L->contains(ChildBB))
+ ChildrenToUpdate.push_back(ChildBB);
+ }
+ BasicBlock *NewIDom;
+ if (BB == LatchBlock) {
+ // The latch is special because we emit unconditional branches in
+ // some cases where the original loop contained a conditional branch.
+ // Since the latch is always at the bottom of the loop, if the latch
+ // dominated an exit before unrolling, the new dominator of that exit
+ // must also be a latch. Specifically, the dominator is the first
+ // latch which ends in a conditional branch, or the last latch if
+ // there is no such latch.
+ NewIDom = Latches.back();
+ for (BasicBlock *IterLatch : Latches) {
+ TerminatorInst *Term = IterLatch->getTerminator();
+ if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
+ NewIDom = IterLatch;
+ break;
+ }
+ }
+ } else {
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
+ }
+ for (auto *ChildBB : ChildrenToUpdate)
+ DT->changeImmediateDominator(ChildBB, NewIDom);
+ }
+ }
+
+ if (DT && UnrollVerifyDomtree)
+ DT->verifyDomTree();
+
+ // Merge adjacent basic blocks, if possible.
+ SmallPtrSet<Loop *, 4> ForgottenLoops;
+ for (BasicBlock *Latch : Latches) {
+ BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
+ if (Term->isUnconditional()) {
+ BasicBlock *Dest = Term->getSuccessor(0);
+ if (BasicBlock *Fold =
+ foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
+ // Dest has been folded into Fold. Update our worklists accordingly.
+ std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+ UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
+ UnrolledLoopBlocks.end(), Dest),
+ UnrolledLoopBlocks.end());
+ }
+ }
+ }
+
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && !CompletelyUnroll && Count > 1) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
+ for (BasicBlock *BB : NewLoopBlocks) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
+ Inst->replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
+ }
+ }
+
+ // TODO: after peeling or unrolling, previously loop variant conditions are
+ // likely to fold to constants, eagerly propagating those here will require
+ // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
+ // appropriate.
+
+ NumCompletelyUnrolled += CompletelyUnroll;
+ ++NumUnrolled;
+
+ Loop *OuterL = L->getParentLoop();
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->markAsRemoved(L);
+
+ // After complete unrolling most of the blocks should be contained in OuterL.
+ // However, some of them might happen to be out of OuterL (e.g. if they
+ // precede a loop exit). In this case we might need to insert PHI nodes in
+ // order to preserve LCSSA form.
+ // We don't need to check this if we already know that we need to fix LCSSA
+ // form.
+ // TODO: For now we just recompute LCSSA for the outer loop in this case, but
+ // it should be possible to fix it in-place.
+ if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
+ NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
+
+ // If we have a pass and a DominatorTree we should re-simplify impacted loops
+ // to ensure subsequent analyses can rely on this form. We want to simplify
+ // at least one layer outside of the loop that was unrolled so that any
+ // changes to the parent loop exposed by the unrolling are considered.
+ if (DT) {
+ if (OuterL) {
+ // OuterL includes all loops for which we can break loop-simplify, so
+ // it's sufficient to simplify only it (it'll recursively simplify inner
+ // loops too).
+ if (NeedToFixLCSSA) {
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop
+ // after LoopInfo's been updated by markAsRemoved.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ Loop *FixLCSSALoop = OuterL;
+ if (!FixLCSSALoop->contains(LatchLoop))
+ while (FixLCSSALoop->getParentLoop() != LatchLoop)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+
+ formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
+ } else if (PreserveLCSSA) {
+ assert(OuterL->isLCSSAForm(*DT) &&
+ "Loops should be in LCSSA form after loop-unroll.");
+ }
+
+ // TODO: That potentially might be compile-time expensive. We should try
+ // to fix the loop-simplified form incrementally.
+ simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
+ } else {
+ // Simplify loops for which we might've broken loop-simplify form.
+ for (Loop *SubLoop : LoopsToSimplify)
+ simplifyLoop(SubLoop, DT, LI, SE, AC, PreserveLCSSA);
+ }
+ }
+
+ return true;
+}
+
+/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
+/// node with the given name (for example, "llvm.loop.unroll.count"). If no
+/// such metadata node exists, then nullptr is returned.
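+///
+/// For example (illustrative), a loop id node may look like:
+///   !0 = distinct !{!0, !1}
+///   !1 = !{!"llvm.loop.unroll.count", i32 4}
+/// Here GetUnrollMetadata(!0, "llvm.loop.unroll.count") returns !1.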
+MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (Name.equals(S->getString()))
+ return MD;
+ }
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
new file mode 100644
index 000000000000..5c21490793e7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -0,0 +1,554 @@
+//===-- UnrollLoopPeel.cpp - Loop peeling utilities -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for peeling loops
+// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for
+// unrolling loops with compile-time constant trip counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+STATISTIC(NumPeeled, "Number of loops peeled");
+
+static cl::opt<unsigned> UnrollPeelMaxCount(
+ "unroll-peel-max-count", cl::init(7), cl::Hidden,
+ cl::desc("Max average trip count which will cause loop peeling."));
+
+static cl::opt<unsigned> UnrollForcePeelCount(
+ "unroll-force-peel-count", cl::init(0), cl::Hidden,
+ cl::desc("Force a peel count regardless of profiling information."));
+
+// Designates that a Phi is estimated to become invariant after an "infinite"
+// number of loop iterations (i.e. it may only become invariant if the loop is
+// fully unrolled).
+static const unsigned InfiniteIterationsToInvariance = UINT_MAX;
+
+// Check whether we are capable of peeling this loop.
+static bool canPeel(Loop *L) {
+ // Make sure the loop is in simplified form
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ // Only peel loops that contain a single exit
+ if (!L->getExitingBlock() || !L->getUniqueExitBlock())
+ return false;
+
+ // Don't try to peel loops where the latch is not the exiting block.
+ // This can be an indication of two different things:
+ // 1) The loop is not rotated.
+ // 2) The loop contains irreducible control flow that involves the latch.
+ if (L->getLoopLatch() != L->getExitingBlock())
+ return false;
+
+ return true;
+}
+
+// This function calculates the number of iterations after which the given Phi
+// becomes an invariant. The pre-calculated values are memoized in the map.
+// The function (abbreviated as I below) is defined as follows:
+// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
+// If %y is a loop invariant, then I(%x) = 1.
+// If %y is a Phi from the loop header, I(%x) = I(%y) + 1.
+// Otherwise, I(%x) is infinite.
+// TODO: Actually if %y is an expression that depends only on Phi %z and some
+// loop invariants, we can estimate I(%x) = I(%z) + 1. The example
+// looks like:
+// %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration.
+// %y = phi(0, 5),
+// %a = %y + 1.
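+// A worked example of the definition above (illustrative): given the header
+// Phis
+//   %i = phi(0, %j),
+//   %j = phi(0, %inv),   where %inv is defined outside the loop,
+// I(%j) = 1 and I(%i) = I(%j) + 1 = 2, so peeling two iterations would make
+// both Phis invariant.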
+static unsigned calculateIterationsToInvariance(
+ PHINode *Phi, Loop *L, BasicBlock *BackEdge,
+ SmallDenseMap<PHINode *, unsigned> &IterationsToInvariance) {
+ assert(Phi->getParent() == L->getHeader() &&
+ "Non-loop Phi should not be checked for turning into invariant.");
+ assert(BackEdge == L->getLoopLatch() && "Wrong latch?");
+ // If we already know the answer, take it from the map.
+ auto I = IterationsToInvariance.find(Phi);
+ if (I != IterationsToInvariance.end())
+ return I->second;
+
+ // Otherwise we need to analyze the input from the back edge.
+ Value *Input = Phi->getIncomingValueForBlock(BackEdge);
+ // Place infinity to map to avoid infinite recursion for cycled Phis. Such
+ // cycles can never stop on an invariant.
+ IterationsToInvariance[Phi] = InfiniteIterationsToInvariance;
+ unsigned ToInvariance = InfiniteIterationsToInvariance;
+
+ if (L->isLoopInvariant(Input))
+ ToInvariance = 1u;
+ else if (PHINode *IncPhi = dyn_cast<PHINode>(Input)) {
+ // Only consider Phis in header block.
+ if (IncPhi->getParent() != L->getHeader())
+ return InfiniteIterationsToInvariance;
+ // If the input becomes an invariant after X iterations, then our Phi
+ // becomes an invariant after X + 1 iterations.
+ unsigned InputToInvariance = calculateIterationsToInvariance(
+ IncPhi, L, BackEdge, IterationsToInvariance);
+ if (InputToInvariance != InfiniteIterationsToInvariance)
+ ToInvariance = InputToInvariance + 1u;
+ }
+
+ // If we found that this Phi lies in an invariant chain, update the map.
+ if (ToInvariance != InfiniteIterationsToInvariance)
+ IterationsToInvariance[Phi] = ToInvariance;
+ return ToInvariance;
+}
+
+// Compute the number of iterations to peel off and store it in UP.PeelCount.
+void llvm::computePeelCount(Loop *L, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ unsigned &TripCount) {
+ assert(LoopSize > 0 && "Zero loop size is not allowed!");
+ UP.PeelCount = 0;
+ if (!canPeel(L))
+ return;
+
+ // Only try to peel innermost loops.
+ if (!L->empty())
+ return;
+
+ // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
+ // iterations of the loop. For this we compute the number of iterations after
+ // which every Phi is guaranteed to become an invariant, and try to peel the
+ // maximum number of iterations among these values, thus turning all those
+ // Phis into invariants.
+ // First, check that we can peel at least one iteration.
+ if (2 * LoopSize <= UP.Threshold && UnrollPeelMaxCount > 0) {
+ // Store the pre-calculated values here.
+ SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
+ // Now go through all Phis to calculate the number of iterations they
+ // need to become invariants.
+ unsigned DesiredPeelCount = 0;
+ BasicBlock *BackEdge = L->getLoopLatch();
+ assert(BackEdge && "Loop is not in simplified form?");
+ for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
+ PHINode *Phi = cast<PHINode>(&*BI);
+ unsigned ToInvariance = calculateIterationsToInvariance(
+ Phi, L, BackEdge, IterationsToInvariance);
+ if (ToInvariance != InfiniteIterationsToInvariance)
+ DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
+ }
+ if (DesiredPeelCount > 0) {
+ // Pay respect to limitations implied by loop size and the max peel count.
+ unsigned MaxPeelCount = UnrollPeelMaxCount;
+ MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
+ DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
+ // Consider max peel count limitation.
+ assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
+ DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn"
+ << " some Phis into invariants.\n");
+ UP.PeelCount = DesiredPeelCount;
+ return;
+ }
+ }
+
+ // Bail if we know the statically calculated trip count.
+ // In this case we prefer partial unrolling instead.
+ if (TripCount)
+ return;
+
+ // If the user provided a peel count, use that.
+ bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
+ if (UserPeelCount) {
+ DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
+ << " iterations.\n");
+ UP.PeelCount = UnrollForcePeelCount;
+ return;
+ }
+
+ // If we don't know the trip count, but have reason to believe the average
+ // trip count is low, peeling should be beneficial, since we will usually
+ // hit the peeled section.
+ // We only do this in the presence of profile information, since otherwise
+ // our estimates of the trip count are not reliable enough.
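+ // A worked example (hypothetical numbers): with an estimated trip count of
+ // 3, UnrollPeelMaxCount = 7 and LoopSize = 10, the peeling cost is
+ // 10 * (3 + 1) = 40, so if UP.Threshold >= 40 we peel 3 iterations.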
+ if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) {
+ Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
+ if (!PeelCount)
+ return;
+
+ DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
+ << "\n");
+
+ if (*PeelCount) {
+ if ((*PeelCount <= UnrollPeelMaxCount) &&
+ (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
+ DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
+ UP.PeelCount = *PeelCount;
+ return;
+ }
+ DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+ DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+ DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
+ DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
+ }
+ }
+
+ return;
+}
+
+/// \brief Update the branch weights of the latch of a peeled-off loop
+/// iteration.
+/// This sets the branch weights for the latch of the recently peeled off loop
+/// iteration correctly.
+/// Our goal is to make sure that:
+/// a) The total weight of all the copies of the loop body is preserved.
+/// b) The total weight of the loop exit is preserved.
+/// c) The body weight is reasonably distributed between the peeled iterations.
+///
+/// \param Header The copy of the header block that belongs to the next iteration.
+/// \param LatchBR The copy of the latch branch that belongs to this iteration.
+/// \param IterNumber The serial number of the iteration that was just
+/// peeled off.
+/// \param AvgIters The average number of iterations we expect the loop to have.
+/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
+/// iterations that are unaccounted for. As an input, it represents the number
+/// of times we expect to enter the header of the iteration currently being
+/// peeled off. The output is the number of times we expect to enter the
+/// header of the next iteration.
+static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
+ unsigned IterNumber, unsigned AvgIters,
+ uint64_t &PeeledHeaderWeight) {
+
+ // FIXME: Pick a more realistic distribution.
+ // Currently the proportion of weight we assign to the fall-through
+ // side of the branch drops linearly with the iteration number, and we use
+ // a 0.9 fudge factor to make the drop-off less sharp...
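+ // A worked example (hypothetical weights): with AvgIters = 4 and
+ // PeeledHeaderWeight = 100, the first peeled iteration (IterNumber = 0) gets
+ // FallThruWeight = 100 * (4 / 4) * 0.9 = 90 and ExitWeight = 100 - 90 = 10,
+ // leaving PeeledHeaderWeight = 90 for the next iteration.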
+ if (PeeledHeaderWeight) {
+ uint64_t FallThruWeight =
+ PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
+ uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
+ PeeledHeaderWeight -= ExitWeight;
+
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+ MDBuilder MDB(LatchBR->getContext());
+ MDNode *WeightNode =
+ HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
+ : MDB.createBranchWeights(FallThruWeight, ExitWeight);
+ LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+ }
+}
+
+/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
+/// InsertBot.
+/// \param IterNumber The serial number of the iteration currently being
+/// peeled off.
+/// \param Exit The exit block of the original loop.
+/// \param[out] NewBlocks A list of the blocks in the newly created clone.
+/// \param[out] VMap The value map between the loop and the new clone.
+/// \param LoopBlocks A helper for DFS-traversal of the loop.
+/// \param LVMap A value-map that maps instructions from the original loop to
+/// instructions in the last peeled-off iteration.
+static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Exit,
+ SmallVectorImpl<BasicBlock *> &NewBlocks,
+ LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
+ ValueToValueMapTy &LVMap, DominatorTree *DT,
+ LoopInfo *LI) {
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *PreHeader = L->getLoopPreheader();
+
+ Function *F = Header->getParent();
+ LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+ Loop *ParentLoop = L->getParentLoop();
+
+ // For each block in the original loop, create a new copy,
+ // and update the value map with the newly created values.
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
+ NewBlocks.push_back(NewBB);
+
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ VMap[*BB] = NewBB;
+
+ // If dominator tree is available, insert nodes to represent cloned blocks.
+ if (DT) {
+ if (Header == *BB)
+ DT->addNewBlock(NewBB, InsertTop);
+ else {
+ DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
+ // VMap must contain entry for IDom, as the iteration order is RPO.
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
+ }
+ }
+ }
+
+ // Hook-up the control flow for the newly inserted blocks.
+ // The new header is hooked up directly to the "top", which is either
+ // the original loop preheader (for the first iteration) or the previous
+ // iteration's exiting block (for every other iteration)
+ InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));
+
+ // Similarly, for the latch:
+ // The original exiting edge is still hooked up to the loop exit.
+ // The backedge now goes to the "bottom", which is either the loop's real
+ // header (for the last peeled iteration) or the copied header of the next
+ // iteration (for every other iteration)
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ BranchInst *LatchBR = cast<BranchInst>(NewLatch->getTerminator());
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+ LatchBR->setSuccessor(HeaderIdx, InsertBot);
+ LatchBR->setSuccessor(1 - HeaderIdx, Exit);
+ if (DT)
+ DT->changeImmediateDominator(InsertBot, NewLatch);
+
+ // The new copy of the loop body starts with a bunch of PHI nodes
+ // that pick an incoming value from either the preheader, or the previous
+ // loop iteration. Since this copy is no longer part of the loop, we
+ // resolve this statically:
+ // For the first iteration, we use the value from the preheader directly.
+ // For any other iteration, we replace the phi with the value generated by
+ // the immediately preceding clone of the loop body (which represents
+ // the previous iteration).
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+ if (IterNumber == 0) {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
+ } else {
+ Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
+ if (LatchInst && L->contains(LatchInst))
+ VMap[&*I] = LVMap[LatchInst];
+ else
+ VMap[&*I] = LatchVal;
+ }
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
+
+ // Fix up the outgoing values - we need to add a value for the iteration
+ // we've just created. Note that this must happen *after* the incoming
+ // values are adjusted, since the value going out of the latch may also be
+ // a value coming into the header.
+ for (BasicBlock::iterator I = Exit->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ Value *LatchVal = PHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
+ if (LatchInst && L->contains(LatchInst))
+ LatchVal = VMap[LatchVal];
+ PHI->addIncoming(LatchVal, cast<BasicBlock>(VMap[Latch]));
+ }
+
+ // LVMap is updated with the values for the current iteration,
+ // which are used the next time this function is called.
+ for (const auto &KV : VMap)
+ LVMap[KV.first] = KV.second;
+}
+
+/// \brief Peel off the first \p PeelCount iterations of loop \p L.
+///
+/// Note that this does not peel them off as a single straight-line block.
+/// Rather, each iteration is peeled off separately, and needs to check the
+/// exit condition.
+/// For loops that dynamically execute \p PeelCount iterations or less
+/// this provides a benefit, since the peeled off iterations, which account
+/// for the bulk of dynamic execution, can be further simplified by scalar
+/// optimizations.
+bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, bool PreserveLCSSA) {
+ if (!canPeel(L))
+ return false;
+
+ LoopBlocksDFS LoopBlocks(L);
+ LoopBlocks.perform(LI);
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Exit = L->getUniqueExitBlock();
+
+ Function *F = Header->getParent();
+
+ // Set up all the necessary basic blocks. It is convenient to split the
+ // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
+ // body, and a new preheader for the "real" loop.
+
+ // Peeling the first iteration transforms the loop as follows.
+ //
+ // PreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+ //
+ // into
+ //
+ // InsertTop:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot:
+ // NewPreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+ //
+ // Each following iteration will split the current bottom anchor in two,
+ // and put the new copy of the loop body between these two blocks. That is,
+ // after peeling another iteration from the example above, we'll split
+ // InsertBot, and get:
+ //
+ // InsertTop:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot:
+ // LoopBody
+ // If (!cond) goto Exit
+ // InsertBot.next:
+ // NewPreHeader:
+ // ...
+ // Header:
+ // LoopBody
+ // If (cond) goto Header
+ // Exit:
+
+ BasicBlock *InsertTop = SplitEdge(PreHeader, Header, DT, LI);
+ BasicBlock *InsertBot =
+ SplitBlock(InsertTop, InsertTop->getTerminator(), DT, LI);
+ BasicBlock *NewPreHeader =
+ SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
+
+ InsertTop->setName(Header->getName() + ".peel.begin");
+ InsertBot->setName(Header->getName() + ".peel.next");
+ NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
+
+ ValueToValueMapTy LVMap;
+
+ // If we have branch weight information, we'll want to update it for the
+ // newly created branches.
+ BranchInst *LatchBR =
+ cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator());
+ unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
+
+ uint64_t TrueWeight, FalseWeight;
+ uint64_t ExitWeight = 0, CurHeaderWeight = 0;
+ if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
+ ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
+ // The # of times the loop body executes is the sum of the exit block
+ // weight and the # of times the backedges are taken.
+ CurHeaderWeight = TrueWeight + FalseWeight;
+ }
+
+ // For each peeled-off iteration, make a copy of the loop.
+ for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ ValueToValueMapTy VMap;
+
+ // Subtract the exit weight from the current header weight -- the exit
+ // weight is exactly the weight of the previous iteration's header.
+ // FIXME: due to the way the distribution is constructed, we need a
+ // guard here to make sure we don't end up with non-positive weights.
+ if (ExitWeight < CurHeaderWeight)
+ CurHeaderWeight -= ExitWeight;
+ else
+ CurHeaderWeight = 1;
+
+ cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
+ NewBlocks, LoopBlocks, VMap, LVMap, DT, LI);
+
+ // Remap to use values from the current iteration instead of the
+ // previous one.
+ remapInstructionsInBlocks(NewBlocks, VMap);
+
+ if (DT) {
+ // Latches of the cloned iterations dominate the loop exit, so the idom of
+ // the exit becomes the latch of the first cloned iteration, just as the
+ // original PreHeader dominates the original loop body.
+ if (Iter == 0)
+ DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch]));
+#ifndef NDEBUG
+ if (VerifyDomInfo)
+ DT->verifyDomTree();
+#endif
+ }
+
+ updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
+ PeelCount, ExitWeight);
+
+ InsertTop = InsertBot;
+ InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
+ InsertBot->setName(Header->getName() + ".peel.next");
+
+ F->getBasicBlockList().splice(InsertTop->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+ }
+
+ // Now adjust the phi nodes in the loop header to get their initial values
+ // from the last peeled-off iteration instead of the preheader.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PHI = cast<PHINode>(I);
+ Value *NewVal = PHI->getIncomingValueForBlock(Latch);
+ Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
+ if (LatchInst && L->contains(LatchInst))
+ NewVal = LVMap[LatchInst];
+
+ PHI->setIncomingValue(PHI->getBasicBlockIndex(NewPreHeader), NewVal);
+ }
+
+ // Adjust the branch weights on the loop exit.
+ if (ExitWeight) {
+ // The backedge count is the difference of current header weight and
+ // current loop exit weight. If the current header weight is smaller than
+ // the current loop exit weight, we mark the loop backedge weight as 1.
+ uint64_t BackEdgeWeight = 0;
+ if (ExitWeight < CurHeaderWeight)
+ BackEdgeWeight = CurHeaderWeight - ExitWeight;
+ else
+ BackEdgeWeight = 1;
+ MDBuilder MDB(LatchBR->getContext());
+ MDNode *WeightNode =
+ HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
+ : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
+ LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+ }
+
+ // If the loop is nested, we changed the parent loop, update SE.
+ if (Loop *ParentLoop = L->getParentLoop()) {
+ SE->forgetLoop(ParentLoop);
+
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(ParentLoop, DT, LI, SE, AC, PreserveLCSSA);
+ } else {
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(L, DT, LI, SE, AC, PreserveLCSSA);
+ }
+
+ NumPeeled++;
+
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
new file mode 100644
index 000000000000..5170c68e2915
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -0,0 +1,847 @@
+//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for loops with run-time
+// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
+// trip counts.
+//
+// The functions in this file are used to generate extra code when the
+// run-time trip count modulo the unroll factor is not 0. When this is the
+// case, we need to generate code to execute these 'left over' iterations.
+//
+// The current strategy generates an if-then-else sequence prior to the
+// unrolled loop to execute the 'left over' iterations before or after the
+// unrolled loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+STATISTIC(NumRuntimeUnrolled,
+ "Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+ "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+ "epilog is generated"));
+
+/// Connect the unrolling prolog code to the original loop.
+/// The unrolling prolog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Create PHI nodes at prolog end block to combine values
+/// that exit the prolog code and jump around the prolog.
+/// - Add a PHI operand to a PHI node at the loop exit block
+/// for values that exit the prolog and go around the loop.
+/// - Branch around the original loop if the trip count is less
+/// than the unroll factor.
+///
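+/// A rough sketch of the resulting block layout (PrologHeader and PrologLatch
+/// denote the cloned prolog loop; other names follow this function's
+/// parameters):
+///
+///    PreHeader
+///      PrologHeader
+///      ...
+///      PrologLatch
+///    PrologExit
+///    NewPreHeader
+///      Header
+///      ...
+///      Latch
+///    OriginalLoopLatchExit
+///
+/// PreHeader can branch around the prolog directly into PrologExit, and
+/// PrologExit can branch around the main loop into OriginalLoopLatchExit.
+///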
+static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
+ BasicBlock *PrologExit,
+ BasicBlock *OriginalLoopLatchExit,
+ BasicBlock *PreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Create a PHI node for each outgoing value from the original loop
+ // (which means it is an outgoing value from the prolog code too).
+ // The new PHI node is inserted in the prolog end basic block.
+ // The new PHI node value is added as an operand of a PHI node in either
+ // the loop header or the loop exit block.
+ for (BasicBlock *Succ : successors(Latch)) {
+ for (Instruction &BBI : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
+ // Add a new PHI node to the prolog end block and add the
+ // appropriate incoming values.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ PrologExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the original loop preheader.
+ // This is the value that skips all the prolog code.
+ if (L->contains(PN)) {
+ NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
+ PreHeader);
+ } else {
+ NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+ }
+
+ Value *V = PN->getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (L->contains(I)) {
+ V = VMap.lookup(I);
+ }
+ }
+ // Adding a value to the new PHI node from the last prolog block
+ // that was created.
+ NewPN->addIncoming(V, PrologLatch);
+
+ // Update the existing PHI node operand with the value from the
+ // new PHI node. How this is done depends on if the existing
+ // PHI node is in the original loop block, or the exit block.
+ if (L->contains(PN)) {
+ PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
+ } else {
+ PN->addIncoming(NewPN, PrologExit);
+ }
+ }
+ }
+
+ // Make sure that created prolog loop is in simplified form
+ SmallVector<BasicBlock *, 4> PrologExitPreds;
+ Loop *PrologLoop = LI->getLoopFor(PrologLatch);
+ if (PrologLoop) {
+ for (BasicBlock *PredBB : predecessors(PrologExit))
+ if (PrologLoop->contains(PredBB))
+ PrologExitPreds.push_back(PredBB);
+
+ SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
+ PreserveLCSSA);
+ }
+
+ // Create a branch around the original loop, which is taken if there are no
+ // iterations remaining to be executed after running the prologue.
+ Instruction *InsertPt = PrologExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+
+ assert(Count != 0 && "nonsensical Count!");
+
+ // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
+ // This means %xtraiter is (BECount + 1) and all of the iterations of this
+ // loop were executed by the prologue. Note that if BECount <u (Count - 1)
+ // then (BECount + 1) cannot unsigned-overflow.
+ Value *BrLoopExit =
+ B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
+ // Split the exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
+ SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
+ PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolled loop)
+ B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
+}
+
+/// Connect the unrolling epilog code to the original loop.
+/// The unrolling epilog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit to combine
+/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI operands in the epilog loop by the new PHI nodes
+/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+///
+static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
+ BasicBlock *Exit, BasicBlock *PreHeader,
+ BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Loop structure should be the following:
+ //
+ // PreHeader
+ // NewPreHeader
+ // Header
+ // ...
+ // Latch
+ // NewExit (PN)
+ // EpilogPreHeader
+ // EpilogHeader
+ // ...
+ // EpilogLatch
+ // Exit (EpilogPN)
+
+ // Update PHI nodes at NewExit and Exit.
+ for (Instruction &BBI : *NewExit) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
+ // PN should be used in another PHI located in Exit block as
+ // Exit was split by SplitBlockPredecessors into Exit and NewExit
+ // Basically it should look like:
+ // NewExit:
+ // PN = PHI [I, Latch]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, EpilogPreHeader]
+ //
+ // There is an EpilogPreHeader incoming block instead of NewExit as
+ // NewExit was split one more time to get EpilogPreHeader.
+ assert(PN->hasOneUse() && "The phi should have 1 use");
+ PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
+ assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+ // Add incoming PreHeader from branch around the Loop
+ PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+
+ Value *V = PN->getIncomingValueForBlock(Latch);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I && L->contains(I))
+ // If value comes from an instruction in the loop add VMap value.
+ V = VMap.lookup(I);
+ // For instructions defined outside the loop, constants, or undef values,
+ // use the value itself.
+ EpilogPN->addIncoming(V, EpilogLatch);
+
+ assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+ "EpilogPN should have EpilogPreHeader incoming block");
+ // Change EpilogPreHeader incoming block to NewExit.
+ EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+ NewExit);
+ // Now PHIs should look like:
+ // NewExit:
+ // PN = PHI [I, Latch], [undef, PreHeader]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+ }
+
+ // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+ // Update corresponding PHI nodes in epilog loop.
+ for (BasicBlock *Succ : successors(Latch)) {
+ // Skip this as we already updated phis in exit blocks.
+ if (!L->contains(Succ))
+ continue;
+ for (Instruction &BBI : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ // Exit when we passed all PHI nodes.
+ if (!PN)
+ break;
+ // Add new PHI nodes to the loop exit block and update epilog
+ // PHIs with the new PHI values.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+ NewExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the unrolling loop preheader.
+ NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
+ // Adding a value to the new PHI node from the unrolling loop latch.
+ NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
+
+ // Update the existing PHI node operand with the value from the new PHI
+ // node. Corresponding instruction in epilog loop should be PHI.
+ PHINode *VPN = cast<PHINode>(VMap[&BBI]);
+ VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
+ }
+ }
+
+ Instruction *InsertPt = NewExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+ Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
+ assert(Exit && "Loop must have a single exit block only");
+ // Split the epilogue exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+ SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
+ PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolling loop)
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, NewExit);
+
+ // Split the main loop exit to maintain canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
+ SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI,
+ PreserveLCSSA);
+}
+
+/// Create a clone of the blocks in a loop and connect them together.
+/// If CreateRemainderLoop is false, the loop structure will not be cloned;
+/// otherwise a new loop is created that includes all cloned blocks, and its
+/// induction variable counts NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If the loop structure is cloned, InsertTop should be the new preheader and
+/// InsertBot the new loop exit.
+/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+static Loop *
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
+ std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
+ ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
+ StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ Function *F = Header->getParent();
+ LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+ Loop *ParentLoop = L->getParentLoop();
+ NewLoopsMap NewLoops;
+ NewLoops[ParentLoop] = ParentLoop;
+ if (!CreateRemainderLoop)
+ NewLoops[L] = ParentLoop;
+
+ // For each block in the original loop, create a new copy,
+ // and update the value map with the newly created values.
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
+ NewBlocks.push_back(NewBB);
+
+ // If we're unrolling the outermost loop, there's no remainder loop,
+ // and this block isn't in a nested loop, then the new block is not
+ // in any loop. Otherwise, add it to LoopInfo.
+ if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
+ addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
+
+ VMap[*BB] = NewBB;
+ if (Header == *BB) {
+ // For the first block, add a CFG connection to this newly
+ // created block.
+ InsertTop->getTerminator()->setSuccessor(0, NewBB);
+ }
+
+ if (DT) {
+ if (Header == *BB) {
+ // The header is dominated by the preheader.
+ DT->addNewBlock(NewBB, InsertTop);
+ } else {
+ // Copy information from original loop to unrolled loop.
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ }
+ }
+
+ if (Latch == *BB) {
+ // For the last block, if CreateRemainderLoop is false, create a direct
+ // jump to InsertBot. Otherwise, create a loop back to the cloned header.
+ VMap.erase((*BB)->getTerminator());
+ BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
+ BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
+ IRBuilder<> Builder(LatchBR);
+ if (!CreateRemainderLoop) {
+ Builder.CreateBr(InsertBot);
+ } else {
+ PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+ suffix + ".iter",
+ FirstLoopBB->getFirstNonPHI());
+ Value *IdxSub =
+ Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".sub");
+ Value *IdxCmp =
+ Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
+ NewIdx->addIncoming(NewIter, InsertTop);
+ NewIdx->addIncoming(IdxSub, NewBB);
+ }
+ LatchBR->eraseFromParent();
+ }
+ }
+
+ // Change the incoming values to the ones defined in the preheader or
+ // cloned loop.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+ if (!CreateRemainderLoop) {
+ if (UseEpilogRemainder) {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ NewPHI->removeIncomingValue(Latch, false);
+ } else {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
+ } else {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ idx = NewPHI->getBasicBlockIndex(Latch);
+ Value *InVal = NewPHI->getIncomingValue(idx);
+ NewPHI->setIncomingBlock(idx, NewLatch);
+ if (Value *V = VMap.lookup(InVal))
+ NewPHI->setIncomingValue(idx, V);
+ }
+ }
+ if (CreateRemainderLoop) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
+ // Add unroll disable metadata to disable future unrolling for this loop.
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ MDNode *LoopID = NewLoop->getLoopID();
+ if (LoopID) {
+ // First remove any existing loop unrolling metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ bool IsUnrollMetadata = false;
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+ }
+ if (!IsUnrollMetadata)
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ LLVMContext &Context = NewLoop->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ NewLoop->setLoopID(NewLoopID);
+ return NewLoop;
+ }
+ else
+ return nullptr;
+}
+
+/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
+/// is populated with all the loop exit blocks other than the LatchExit block.
+static bool
+canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
+ BasicBlock *LatchExit, bool PreserveLCSSA,
+ bool UseEpilogRemainder) {
+
+ // Support runtime unrolling for multiple exit blocks and multiple exiting
+ // blocks.
+ if (!UnrollRuntimeMultiExit)
+ return false;
+ // Even if runtime multi-exit unrolling is enabled, we currently have some
+ // correctness constraints on unrolling a multi-exit loop.
+ // We rely on LCSSA form being preserved when the exit blocks are transformed.
+ if (!PreserveLCSSA)
+ return false;
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ for (auto *BB : Exits)
+ if (BB != LatchExit)
+ OtherExits.push_back(BB);
+
+ // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
+ // UnrollRuntimeMultiExit is true. This will require updating the logic in
+ // ConnectEpilog/ConnectProlog.
+ if (!LatchExit->getSinglePredecessor()) {
+ DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
+ return false;
+ }
+ // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
+ // and L is an inner loop. This is because, in the presence of multiple exits,
+ // the outer loop is incorrect: we do not add the EpilogPreheader and exit to
+ // the outer loop. This is handled automatically in the prolog case, so we do
+ // not have that bug in prolog generation.
+ if (UseEpilogRemainder && L->getParentLoop())
+ return false;
+
+ // All constraints have been satisfied.
+ return true;
+}
+
+/// Insert code in the prolog/epilog code when unrolling a loop with a
+/// run-time trip-count.
+///
+/// This method assumes that the loop unroll factor is the total number
+/// of loop bodies in the loop after unrolling. (Some folks refer
+/// to the unroll factor as the number of *extra* copies added).
+/// We assume also that the loop unroll factor is a power-of-two. So, after
+/// unrolling the loop, the number of loop bodies executed is 2,
+/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
+/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
+/// the switch instruction is generated.
+///
+/// ***Prolog case***
+/// extraiters = tripcount % loopfactor
+/// if (extraiters == 0) jump Loop:
+/// else jump Prol:
+/// Prol: LoopBody;
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
+/// if (tripcount < loopfactor) jump End:
+/// Loop:
+/// ...
+/// End:
+///
+/// ***Epilog case***
+/// extraiters = tripcount % loopfactor
+/// if (tripcount < loopfactor) jump LoopExit:
+/// unroll_iters = tripcount - extraiters
+/// Loop: LoopBody; (executes unroll_iter times);
+/// unroll_iter -= 1
+/// if (unroll_iter != 0) jump Loop:
+/// LoopExit:
+/// if (extraiters == 0) jump EpilExit:
+/// Epil: LoopBody; (executes extraiters times)
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
+/// EpilExit:
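+///
+/// Worked example (illustration only, not from the original comment): with a
+/// run-time tripcount of 10 and loopfactor (unroll factor) 4, extraiters is
+/// 10 % 4 == 2. In the prolog case the prolog body runs 2 iterations and the
+/// unrolled loop then runs the remaining 8 (two passes through 4 bodies). In
+/// the epilog case unroll_iters is 10 - 2 == 8, so the unrolled loop runs 8
+/// iterations first and the epilog then runs the 2 left-over iterations.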
+
+bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
+ bool AllowExpensiveTripCount,
+ bool UseEpilogRemainder,
+ LoopInfo *LI, ScalarEvolution *SE,
+ DominatorTree *DT, bool PreserveLCSSA) {
+ DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ DEBUG(L->dump());
+
+ // Make sure the loop is in canonical form.
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(dbgs() << "Not in simplify form!\n");
+ return false;
+ }
+
+ // Guaranteed by LoopSimplifyForm.
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+ BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
+ // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+ // targets of the Latch be an exit block out of the loop. This needs
+ // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
+ assert(!L->contains(LatchExit) &&
+ "one of the loop latch successors should be the exit block!");
+ // These are exit blocks other than the target of the latch exiting block.
+ SmallVector<BasicBlock *, 4> OtherExits;
+ bool isMultiExitUnrollingEnabled = canSafelyUnrollMultiExitLoop(
+ L, OtherExits, LatchExit, PreserveLCSSA, UseEpilogRemainder);
+ // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
+ if (!isMultiExitUnrollingEnabled &&
+ (!L->getExitingBlock() || OtherExits.size())) {
+ DEBUG(
+ dbgs()
+ << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+ "enabled!\n");
+ return false;
+ }
+ // Use Scalar Evolution to compute the trip count. This allows more loops to
+ // be unrolled than relying on induction var simplification.
+ if (!SE)
+ return false;
+
+ // Only unroll loops with a computable trip count, and the trip count needs
+ // to be an int value (allowing a pointer type is a TODO item).
+ // We calculate the backedge count by using getExitCount on the Latch block,
+ // which is proven to be the only exiting block in this loop. This is the same
+ // as calculating getBackedgeTakenCount on the loop (which computes SCEV for
+ // all exiting blocks).
+ const SCEV *BECountSC = SE->getExitCount(L, Latch);
+ if (isa<SCEVCouldNotCompute>(BECountSC) ||
+ !BECountSC->getType()->isIntegerTy()) {
+ DEBUG(dbgs() << "Could not compute exit block SCEV\n");
+ return false;
+ }
+
+ unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
+
+ // Add 1 since the backedge count doesn't include the first loop iteration.
+ const SCEV *TripCountSC =
+ SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
+ if (isa<SCEVCouldNotCompute>(TripCountSC)) {
+ DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
+ return false;
+ }
+
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "loop-unroll");
+ if (!AllowExpensiveTripCount &&
+ Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
+ DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
+ return false;
+ }
+
+ // This constraint lets us deal with an overflowing trip count easily; see the
+ // comment on ModVal below.
+ if (Log2_32(Count) > BEWidth) {
+ DEBUG(dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
+ return false;
+ }
+
+ // Loop structure is the following:
+ //
+ // PreHeader
+ // Header
+ // ...
+ // Latch
+ // LatchExit
+
+ BasicBlock *NewPreHeader;
+ BasicBlock *NewExit = nullptr;
+ BasicBlock *PrologExit = nullptr;
+ BasicBlock *EpilogPreHeader = nullptr;
+ BasicBlock *PrologPreHeader = nullptr;
+
+ if (UseEpilogRemainder) {
+ // Epilog remainder:
+ // Split PreHeader to insert a branch around the loop for unrolling.
+ NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ // Split LatchExit to create phi nodes from branch above.
+ SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
+ NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa",
+ DT, LI, PreserveLCSSA);
+ // Split NewExit to insert epilog remainder loop.
+ EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
+ EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+ } else {
+ // Prolog remainder:
+ // Split the original preheader twice to insert the prolog remainder loop.
+ PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+ PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+ PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+ DT, LI);
+ PrologExit->setName(Header->getName() + ".prol.loopexit");
+ // Split PrologExit to get NewPreHeader.
+ NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ }
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // *NewPreHeader *PrologPreHeader
+ // Header *PrologExit
+ // ... *NewPreHeader
+ // Latch Header
+ // *NewExit ...
+ // *EpilogPreHeader Latch
+ // LatchExit LatchExit
+
+ // Calculate conditions for branch around loop for unrolling
+ // in epilog case and around prolog remainder loop in prolog case.
+ // Compute the number of extra iterations required, which is:
+ // extra iterations = run-time trip count % loop unroll factor
+ PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
+ PreHeaderBR);
+ Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
+ PreHeaderBR);
+ IRBuilder<> B(PreHeaderBR);
+ Value *ModVal;
+ // Calculate ModVal = (BECount + 1) % Count.
+ // Note that TripCount is BECount + 1.
+ if (isPowerOf2_32(Count)) {
+ // When Count is a power of 2 we don't need BECount for the epilog case;
+ // however, we'll need it for the branch around the unrolling loop in the
+ // prolog case.
+ ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+ // If the computed ModVal is zero, then either:
+ // 1. There are no iterations to be run in the prolog/epilog loop.
+ // OR
+ // 2. The addition computing TripCount overflowed.
+ //
+ // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+ // the number of iterations that remain to be run in the original loop is a
+ // multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
+ // explicitly check this above).
+ } else {
+ // As (BECount + 1) can potentially overflow in unsigned arithmetic, we
+ // compute (BECount % Count) + 1, which is overflow-safe because
+ // BECount % Count < Count.
+ Value *ModValTmp = B.CreateURem(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count));
+ Value *ModValAdd = B.CreateAdd(ModValTmp,
+ ConstantInt::get(ModValTmp->getType(), 1));
+ // At that point (BECount % Count) + 1 could be equal to Count.
+ // To handle this case we need to take mod by Count one more time.
+ ModVal = B.CreateURem(ModValAdd,
+ ConstantInt::get(BECount->getType(), Count),
+ "xtraiter");
+ }
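+ // Worked example (illustration only): with TripCount = 10 (BECount = 9) and
+ // Count = 4 (a power of two), ModVal = 10 & 3 = 2. With Count = 3 and
+ // BECount = 8 (TripCount = 9), (8 % 3) + 1 = 3 and the second urem folds it
+ // back to 0, matching 9 % 3 == 0; this is why the extra urem is needed.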
+ Value *BranchVal =
+ UseEpilogRemainder ? B.CreateICmpULT(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count - 1)) :
+ B.CreateIsNotNull(ModVal, "lcmp.mod");
+ BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+ BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
+ // Branch to either remainder (extra iterations) loop or unrolling loop.
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
+ PreHeaderBR->eraseFromParent();
+ if (DT) {
+ if (UseEpilogRemainder)
+ DT->changeImmediateDominator(NewExit, PreHeader);
+ else
+ DT->changeImmediateDominator(PrologExit, PreHeader);
+ }
+ Function *F = Header->getParent();
+ // Get an ordered list of blocks in the loop to help with the ordering of the
+ // cloned blocks in the prolog/epilog code
+ LoopBlocksDFS LoopBlocks(L);
+ LoopBlocks.perform(LI);
+
+ //
+ // For each extra loop iteration, create a copy of the loop's basic blocks
+ // and generate a condition that branches to the copy depending on the
+ // number of 'left over' iterations.
+ //
+ std::vector<BasicBlock *> NewBlocks;
+ ValueToValueMapTy VMap;
+
+ // For unroll factor 2 the remainder loop will have exactly 1 iteration.
+ // Do not create a 1-iteration loop.
+ bool CreateRemainderLoop = (Count != 2);
+
+ // Clone all the basic blocks in the loop. If Count is 2, we don't clone
+ // the loop structure; otherwise we create a cloned loop to execute the extra
+ // iterations. This function adds the appropriate CFG connections.
+ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
+ BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+ Loop *remainderLoop = CloneLoopBlocks(
+ L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
+ NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
+
+ // Insert the cloned blocks into the function.
+ F->getBasicBlockList().splice(InsertBot->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(),
+ F->end());
+
+ // Now the loop blocks are cloned and the other exiting blocks from the
+ // remainder are connected to the original Loop's exit blocks. The remaining
+ // work is to update the phi nodes in the original loop, and take in the
+ // values from the cloned region. Also update the dominator info for
+ // OtherExits, since we have new edges into OtherExits.
+ for (auto *BB : OtherExits) {
+ for (auto &II : *BB) {
+
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ if (!isa<PHINode>(II))
+ break;
+ PHINode *Phi = cast<PHINode>(&II);
+ unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Add the incoming values from the remainder code to the end of the phi
+ // node.
+ for (unsigned i = 0; i < oldNumOperands; i++) {
+ Value *newVal = VMap[Phi->getIncomingValue(i)];
+ // newVal can be a constant or derived from values outside the loop, and
+ // hence need not have a VMap value.
+ if (!newVal)
+ newVal = Phi->getIncomingValue(i);
+ Phi->addIncoming(newVal,
+ cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ }
+ }
+ // Update the dominator info because the immediate dominator is no longer the
+ // header of the original Loop. BB has edges both from L and remainder code.
+ // Since the preheader determines whether L runs or we jump directly to
+ // the remainder code, we set the immediate dominator to the preheader.
+ if (DT)
+ DT->changeImmediateDominator(BB, PreHeader);
+ }
+
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // NewPreHeader PrologPreHeader
+ // Header PrologHeader
+ // ... ...
+ // Latch PrologLatch
+ // NewExit PrologExit
+ // EpilogPreHeader NewPreHeader
+ // EpilogHeader Header
+ // ... ...
+ // EpilogLatch Latch
+ // LatchExit LatchExit
+
+ // Rewrite the cloned instruction operands to use the values created when the
+ // clone is created.
+ for (BasicBlock *BB : NewBlocks) {
+ for (Instruction &I : *BB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ }
+ }
+
+ if (UseEpilogRemainder) {
+ // Connect the epilog code to the original loop and update the
+ // PHI functions.
+ ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
+ EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+ PreserveLCSSA);
+
+ // Update the counter in the loop for unrolling.
+ // It should be a multiple of Count.
+ IRBuilder<> B2(NewPreHeader->getTerminator());
+ Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ B2.SetInsertPoint(LatchBR);
+ PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+ Header->getFirstNonPHI());
+ Value *IdxSub =
+ B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".nsub");
+ Value *IdxCmp;
+ if (LatchBR->getSuccessor(0) == Header)
+ IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+ else
+ IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+ NewIdx->addIncoming(TestVal, NewPreHeader);
+ NewIdx->addIncoming(IdxSub, Latch);
+ LatchBR->setCondition(IdxCmp);
+ } else {
+ // Connect the prolog code to the original loop and update the
+ // PHI functions.
+ ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA);
+ }
+
+ // If this loop is nested, then the loop unroller changes the code in the
+ // parent loop, so the Scalar Evolution pass needs to be run again.
+ if (Loop *ParentLoop = L->getParentLoop())
+ SE->forgetLoop(ParentLoop);
+
+ // Canonicalize to LoopSimplifyForm both original and remainder loops. We
+ // cannot rely on the LoopUnrollPass to do this because it only does
+ // canonicalization for parent/subloops and not the sibling loops.
+ if (OtherExits.size() > 0) {
+ // Generate dedicated exit blocks for the original loop, to preserve
+ // LoopSimplifyForm.
+ formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
+ // Generate dedicated exit blocks for the remainder loop if one exists, to
+ // preserve LoopSimplifyForm.
+ if (remainderLoop)
+ formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
+ }
+
+ NumRuntimeUnrolled++;
+ return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
new file mode 100644
index 000000000000..58b70be95d99
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -0,0 +1,1391 @@
+//===-- LoopUtils.cpp - Loop Utility functions -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines common loop utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "loop-utils"
+
+bool RecurrenceDescriptor::areAllUsesIn(Instruction *I,
+ SmallPtrSetImpl<Instruction *> &Set) {
+ for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
+ if (!Set.count(dyn_cast<Instruction>(*Use)))
+ return false;
+ return true;
+}
+
+bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) {
+ switch (Kind) {
+ default:
+ break;
+ case RK_IntegerAdd:
+ case RK_IntegerMult:
+ case RK_IntegerOr:
+ case RK_IntegerAnd:
+ case RK_IntegerXor:
+ case RK_IntegerMinMax:
+ return true;
+ }
+ return false;
+}
+
+bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) {
+ return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind);
+}
+
+bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) {
+ switch (Kind) {
+ default:
+ break;
+ case RK_IntegerAdd:
+ case RK_IntegerMult:
+ case RK_FloatAdd:
+ case RK_FloatMult:
+ return true;
+ }
+ return false;
+}
+
+Instruction *
+RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
+ SmallPtrSetImpl<Instruction *> &Visited,
+ SmallPtrSetImpl<Instruction *> &CI) {
+ if (!Phi->hasOneUse())
+ return Phi;
+
+ const APInt *M = nullptr;
+ Instruction *I, *J = cast<Instruction>(Phi->use_begin()->getUser());
+
+ // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT
+ // with a new integer type of the corresponding bit width.
+ if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) {
+ int32_t Bits = (*M + 1).exactLogBase2();
+ if (Bits > 0) {
+ RT = IntegerType::get(Phi->getContext(), Bits);
+ Visited.insert(Phi);
+ CI.insert(J);
+ return J;
+ }
+ }
+ return Phi;
+}
+
+bool RecurrenceDescriptor::getSourceExtensionKind(
+ Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned,
+ SmallPtrSetImpl<Instruction *> &Visited,
+ SmallPtrSetImpl<Instruction *> &CI) {
+
+ SmallVector<Instruction *, 8> Worklist;
+ bool FoundOneOperand = false;
+ unsigned DstSize = RT->getPrimitiveSizeInBits();
+ Worklist.push_back(Exit);
+
+ // Traverse the instructions in the reduction expression, beginning with the
+ // exit value.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (Use &U : I->operands()) {
+
+ // Terminate the traversal if the operand is not an instruction, or we
+ // reach the starting value.
+ Instruction *J = dyn_cast<Instruction>(U.get());
+ if (!J || J == Start)
+ continue;
+
+ // Otherwise, investigate the operation if it is also in the expression.
+ if (Visited.count(J)) {
+ Worklist.push_back(J);
+ continue;
+ }
+
+ // If the operand is not in Visited, it is not a reduction operation, but
+ // it does feed into one. Make sure it is either a single-use sign- or
+ // zero-extend instruction.
+ CastInst *Cast = dyn_cast<CastInst>(J);
+ bool IsSExtInst = isa<SExtInst>(J);
+ if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst))
+ return false;
+
+ // Ensure the source type of the extend is no larger than the reduction
+ // type. It is not necessary for the types to be identical.
+ unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ if (SrcSize > DstSize)
+ return false;
+
+ // Furthermore, ensure that all such extends are of the same kind.
+ if (FoundOneOperand) {
+ if (IsSigned != IsSExtInst)
+ return false;
+ } else {
+ FoundOneOperand = true;
+ IsSigned = IsSExtInst;
+ }
+
+ // Lastly, if the source type of the extend matches the reduction type,
+ // add the extend to CI so that we can avoid accounting for it in the
+ // cost model.
+ if (SrcSize == DstSize)
+ CI.insert(Cast);
+ }
+ }
+ return true;
+}
+
+bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
+ Loop *TheLoop, bool HasFunNoNaNAttr,
+ RecurrenceDescriptor &RedDes) {
+ if (Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // Reduction variables are only found in the loop header block.
+ if (Phi->getParent() != TheLoop->getHeader())
+ return false;
+
+ // Obtain the reduction start value from the value that comes from the loop
+ // preheader.
+ Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
+
+ // ExitInstruction is the single value which is used outside the loop.
+ // We only allow a single reduction value to be used outside the loop.
+ // This includes the users of the reduction variables, which form a cycle
+ // that ends in the phi node.
+ Instruction *ExitInstruction = nullptr;
+ // Indicates that we found a reduction operation in our scan.
+ bool FoundReduxOp = false;
+
+ // We start with the PHI node and scan for all of the users of this
+ // instruction. All users must be instructions that can be used as reduction
+ // variables (such as ADD). We must have a single out-of-block user. The cycle
+ // must include the original PHI.
+ bool FoundStartPHI = false;
+
+ // To recognize min/max patterns formed by an icmp/select sequence, we store
+ // the number of instructions we saw from the recognized min/max pattern,
+ // to make sure we see exactly the two instructions.
+ unsigned NumCmpSelectPatternInst = 0;
+ InstDesc ReduxDesc(false, nullptr);
+
+ // Data used for determining if the recurrence has been type-promoted.
+ Type *RecurrenceType = Phi->getType();
+ SmallPtrSet<Instruction *, 4> CastInsts;
+ Instruction *Start = Phi;
+ bool IsSigned = false;
+
+ SmallPtrSet<Instruction *, 8> VisitedInsts;
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Return early if the recurrence kind does not match the type of Phi. If the
+ // recurrence kind is arithmetic, we attempt to look through AND operations
+ // resulting from the type promotion performed by InstCombine. Vector
+ // operations are not limited to the legal integer widths, so we may be able
+ // to evaluate the reduction in the narrower width.
+ if (RecurrenceType->isFloatingPointTy()) {
+ if (!isFloatingPointRecurrenceKind(Kind))
+ return false;
+ } else {
+ if (!isIntegerRecurrenceKind(Kind))
+ return false;
+ if (isArithmeticRecurrenceKind(Kind))
+ Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
+ }
+
+ Worklist.push_back(Start);
+ VisitedInsts.insert(Start);
+
+ // A value in the reduction can be used:
+ // - By the reduction:
+ // - Reduction operation:
+ // - One use of reduction value (safe).
+ // - Multiple use of reduction value (not safe).
+ // - PHI:
+ // - All uses of the PHI must be the reduction (safe).
+ // - Otherwise, not safe.
+ // - By instructions outside of the loop (safe).
+ // * One value may have several outside users, but all outside
+ // uses must be of the same value.
+ // - By an instruction that is not part of the reduction (not safe).
+ // This is either:
+ // * An instruction type other than PHI or the reduction operation.
+ // * A PHI in the header other than the initial PHI.
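+ //
+ // A minimal example (added for illustration, not in the original comment):
+ // for an integer add reduction such as "for (...) sum += a[i];", the header
+ // PHI for %sum and the single add feeding back into it form the cycle; the
+ // add is the only value allowed to have users outside the loop.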
+ while (!Worklist.empty()) {
+ Instruction *Cur = Worklist.back();
+ Worklist.pop_back();
+
+ // No Users.
+ // If the instruction has no users then this is a broken chain and can't be
+ // a reduction variable.
+ if (Cur->use_empty())
+ return false;
+
+ bool IsAPhi = isa<PHINode>(Cur);
+
+ // A header PHI use other than the original PHI.
+ if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
+ return false;
+
+ // Reductions with instructions such as Div and Sub are only possible if the
+ // LHS is the reduction variable.
+ if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
+ !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
+ !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
+ return false;
+
+ // Any reduction instruction must be of one of the allowed kinds. We ignore
+ // the starting value (the Phi or an AND instruction if the Phi has been
+ // type-promoted).
+ if (Cur != Start) {
+ ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr);
+ if (!ReduxDesc.isRecurrence())
+ return false;
+ }
+
+ // A reduction operation must only have one use of the reduction value.
+ if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
+ hasMultipleUsesOf(Cur, VisitedInsts))
+ return false;
+
+ // All inputs to a PHI node must be a reduction value.
+ if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
+ return false;
+
+ if (Kind == RK_IntegerMinMax &&
+ (isa<ICmpInst>(Cur) || isa<SelectInst>(Cur)))
+ ++NumCmpSelectPatternInst;
+ if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || isa<SelectInst>(Cur)))
+ ++NumCmpSelectPatternInst;
+
+ // Check whether we found a reduction operator.
+ FoundReduxOp |= !IsAPhi && Cur != Start;
+
+ // Process users of current instruction. Push non-PHI nodes after PHI nodes
+ // onto the stack. This way we are going to have seen all inputs to PHI
+ // nodes once we get to them.
+ SmallVector<Instruction *, 8> NonPHIs;
+ SmallVector<Instruction *, 8> PHIs;
+ for (User *U : Cur->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Check if we found the exit user.
+ BasicBlock *Parent = UI->getParent();
+ if (!TheLoop->contains(Parent)) {
+ // If we already know this instruction is used externally, move on to
+ // the next user.
+ if (ExitInstruction == Cur)
+ continue;
+
+ // Exit if you find multiple values used outside or if the header phi
+ // node is being used. In this case the user uses the value of the
+ // previous iteration, in which case we would lose "VF-1" iterations of
+ // the reduction operation if we vectorize.
+ if (ExitInstruction != nullptr || Cur == Phi)
+ return false;
+
+ // The instruction used by an outside user must be the last instruction
+ // before we feed back to the reduction phi. Otherwise, we lose VF-1
+ // operations on the value.
+ if (!is_contained(Phi->operands(), Cur))
+ return false;
+
+ ExitInstruction = Cur;
+ continue;
+ }
+
+ // Process instructions only once (termination). Each reduction cycle
+ // value must only be used once, except by phi nodes and min/max
+ // reductions which are represented as a cmp followed by a select.
+ InstDesc IgnoredVal(false, nullptr);
+ if (VisitedInsts.insert(UI).second) {
+ if (isa<PHINode>(UI))
+ PHIs.push_back(UI);
+ else
+ NonPHIs.push_back(UI);
+ } else if (!isa<PHINode>(UI) &&
+ ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
+ !isa<SelectInst>(UI)) ||
+ !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))
+ return false;
+
+ // Remember that we completed the cycle.
+ if (UI == Phi)
+ FoundStartPHI = true;
+ }
+ Worklist.append(PHIs.begin(), PHIs.end());
+ Worklist.append(NonPHIs.begin(), NonPHIs.end());
+ }
+
+ // This means we have seen one but not the other instruction of the
+ // pattern or more than just a select and cmp.
+ if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+ NumCmpSelectPatternInst != 2)
+ return false;
+
+ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
+ return false;
+
+ // If we think Phi may have been type-promoted, we also need to ensure that
+ // all source operands of the reduction are either SExtInsts or ZExtInsts. If
+ // so, we will be able to evaluate the reduction in the narrower bit width.
+ if (Start != Phi)
+ if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType,
+ IsSigned, VisitedInsts, CastInsts))
+ return false;
+
+ // We found a reduction var if we have reached the original phi node and we
+ // only have a single instruction with out-of-loop users.
+
+ // The ExitInstruction (the instruction which is allowed to have out-of-loop
+ // users) is saved as part of the RecurrenceDescriptor.
+
+ // Save the description of this reduction variable.
+ RecurrenceDescriptor RD(
+ RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(),
+ ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts);
+ RedDes = RD;
+
+ return true;
+}
+
+/// Recognizes a Select(ICmp(X, Y), X, Y) instruction pattern corresponding to
+/// a min(X, Y) or max(X, Y).
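+///
+/// For example (illustrative IR, not from the original comment):
+///   %cmp = icmp slt i32 %x, %y
+///   %sel = select i1 %cmp, i32 %x, i32 %y
+/// matches the signed-min pattern and yields MRK_SIntMin.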
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) {
+
+ assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
+ "Expect a select instruction");
+ Instruction *Cmp = nullptr;
+ SelectInst *Select = nullptr;
+
+ // We must handle the select(cmp()) as a single instruction. Advance to the
+ // select.
+ if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
+ if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
+ return InstDesc(false, I);
+ return InstDesc(Select, Prev.getMinMaxKind());
+ }
+
+ // Only handle single use cases for now.
+ if (!(Select = dyn_cast<SelectInst>(I)))
+ return InstDesc(false, I);
+ if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
+ !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
+ return InstDesc(false, I);
+ if (!Cmp->hasOneUse())
+ return InstDesc(false, I);
+
+ Value *CmpLeft;
+ Value *CmpRight;
+
+ // Look for a min/max pattern.
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_UIntMin);
+ else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_UIntMax);
+ else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_SIntMax);
+ else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_SIntMin);
+ else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_FloatMin);
+ else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_FloatMax);
+ else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_FloatMin);
+ else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return InstDesc(Select, MRK_FloatMax);
+
+ return InstDesc(false, I);
+}
+
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
+ InstDesc &Prev, bool HasFunNoNaNAttr) {
+ bool FP = I->getType()->isFloatingPointTy();
+ Instruction *UAI = Prev.getUnsafeAlgebraInst();
+ if (!UAI && FP && !I->hasUnsafeAlgebra())
+ UAI = I; // Found an unsafe (unvectorizable) algebra instruction.
+
+ switch (I->getOpcode()) {
+ default:
+ return InstDesc(false, I);
+ case Instruction::PHI:
+ return InstDesc(I, Prev.getMinMaxKind(), Prev.getUnsafeAlgebraInst());
+ case Instruction::Sub:
+ case Instruction::Add:
+ return InstDesc(Kind == RK_IntegerAdd, I);
+ case Instruction::Mul:
+ return InstDesc(Kind == RK_IntegerMult, I);
+ case Instruction::And:
+ return InstDesc(Kind == RK_IntegerAnd, I);
+ case Instruction::Or:
+ return InstDesc(Kind == RK_IntegerOr, I);
+ case Instruction::Xor:
+ return InstDesc(Kind == RK_IntegerXor, I);
+ case Instruction::FMul:
+ return InstDesc(Kind == RK_FloatMult, I, UAI);
+ case Instruction::FSub:
+ case Instruction::FAdd:
+ return InstDesc(Kind == RK_FloatAdd, I, UAI);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ if (Kind != RK_IntegerMinMax &&
+ (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
+ return InstDesc(false, I);
+ return isMinMaxSelectCmpPattern(I, Prev);
+ }
+}
+
+bool RecurrenceDescriptor::hasMultipleUsesOf(
+ Instruction *I, SmallPtrSetImpl<Instruction *> &Insts) {
+ unsigned NumUses = 0;
+ for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E;
+ ++Use) {
+ if (Insts.count(dyn_cast<Instruction>(*Use)))
+ ++NumUses;
+ if (NumUses > 1)
+ return true;
+ }
+
+ return false;
+}
+
+bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
+ RecurrenceDescriptor &RedDes) {
+
+ BasicBlock *Header = TheLoop->getHeader();
+ Function &F = *Header->getParent();
+ bool HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr,
+ RedDes)) {
+ DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) {
+ DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n");
+ return true;
+ }
+ // Not a reduction of known type.
+ return false;
+}
+
+bool RecurrenceDescriptor::isFirstOrderRecurrence(
+ PHINode *Phi, Loop *TheLoop,
+ DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
+
+ // Ensure the phi node is in the loop header and has two incoming values.
+ if (Phi->getParent() != TheLoop->getHeader() ||
+ Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // Ensure the loop has a preheader and a single latch block. The loop
+ // vectorizer will need the latch to set up the next iteration of the loop.
+ auto *Preheader = TheLoop->getLoopPreheader();
+ auto *Latch = TheLoop->getLoopLatch();
+ if (!Preheader || !Latch)
+ return false;
+
+ // Ensure the phi node's incoming blocks are the loop preheader and latch.
+ if (Phi->getBasicBlockIndex(Preheader) < 0 ||
+ Phi->getBasicBlockIndex(Latch) < 0)
+ return false;
+
+ // Get the previous value. The previous value comes from the latch edge while
+ // the initial value comes from the preheader edge.
+ auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+ if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) ||
+ SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
+ return false;
+
+ // Ensure every user of the phi node is dominated by the previous value.
+ // The dominance requirement ensures the loop vectorizer will not need to
+ // vectorize the initial value prior to the first iteration of the loop.
+ // TODO: Consider extending this sinking to handle other kinds of instructions
+ // and expressions, beyond sinking a single cast past Previous.
+ if (Phi->hasOneUse()) {
+ auto *I = Phi->user_back();
+ if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
+ DT->dominates(Previous, I->user_back())) {
+ SinkAfter[I] = Previous;
+ return true;
+ }
+ }
+
+ for (User *U : Phi->users())
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (!DT->dominates(Previous, I))
+ return false;
+ }
+
+ return true;
+}
+
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
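+/// For example (note added for illustration): the identity for an integer add
+/// reduction is 0, so a vectorized sum can be seeded with a zero vector; for
+/// RK_IntegerAnd it is the all-ones value.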
+Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K,
+ Type *Tp) {
+ switch (K) {
+ case RK_IntegerXor:
+ case RK_IntegerAdd:
+ case RK_IntegerOr:
+ // Adding, XORing, or ORing zero to a number does not change it.
+ return ConstantInt::get(Tp, 0);
+ case RK_IntegerMult:
+ // Multiplying a number by 1 does not change it.
+ return ConstantInt::get(Tp, 1);
+ case RK_IntegerAnd:
+ // AND-ing a number with an all-1 value does not change it.
+ return ConstantInt::get(Tp, -1, true);
+ case RK_FloatMult:
+ // Multiplying a number by 1 does not change it.
+ return ConstantFP::get(Tp, 1.0L);
+ case RK_FloatAdd:
+ // Adding zero to a number does not change it.
+ return ConstantFP::get(Tp, 0.0L);
+ default:
+ llvm_unreachable("Unknown recurrence kind");
+ }
+}
+
+/// This function translates the recurrence kind to an LLVM binary operator.
+unsigned RecurrenceDescriptor::getRecurrenceBinOp(RecurrenceKind Kind) {
+ switch (Kind) {
+ case RK_IntegerAdd:
+ return Instruction::Add;
+ case RK_IntegerMult:
+ return Instruction::Mul;
+ case RK_IntegerOr:
+ return Instruction::Or;
+ case RK_IntegerAnd:
+ return Instruction::And;
+ case RK_IntegerXor:
+ return Instruction::Xor;
+ case RK_FloatMult:
+ return Instruction::FMul;
+ case RK_FloatAdd:
+ return Instruction::FAdd;
+ case RK_IntegerMinMax:
+ return Instruction::ICmp;
+ case RK_FloatMinMax:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unknown recurrence operation");
+ }
+}
+
+Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
+ MinMaxRecurrenceKind RK,
+ Value *Left, Value *Right) {
+ CmpInst::Predicate P = CmpInst::ICMP_NE;
+ switch (RK) {
+ default:
+ llvm_unreachable("Unknown min/max recurrence kind");
+ case MRK_UIntMin:
+ P = CmpInst::ICMP_ULT;
+ break;
+ case MRK_UIntMax:
+ P = CmpInst::ICMP_UGT;
+ break;
+ case MRK_SIntMin:
+ P = CmpInst::ICMP_SLT;
+ break;
+ case MRK_SIntMax:
+ P = CmpInst::ICMP_SGT;
+ break;
+ case MRK_FloatMin:
+ P = CmpInst::FCMP_OLT;
+ break;
+ case MRK_FloatMax:
+ P = CmpInst::FCMP_OGT;
+ break;
+ }
+
+ // We only match FP sequences with unsafe algebra, so we can unconditionally
+ // set it on any generated instructions.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ FastMathFlags FMF;
+ FMF.setUnsafeAlgebra();
+ Builder.setFastMathFlags(FMF);
+
+ Value *Cmp;
+ if (RK == MRK_FloatMin || RK == MRK_FloatMax)
+ Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
+ else
+ Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
+ Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+ return Select;
+}
+
+InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
+ const SCEV *Step, BinaryOperator *BOp)
+ : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) {
+ assert(IK != IK_NoInduction && "Not an induction");
+
+ // Start value type should match the induction kind and the value
+ // itself should not be null.
+ assert(StartValue && "StartValue is null");
+ assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
+ "StartValue is not a pointer for pointer induction");
+ assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
+ "StartValue is not an integer for integer induction");
+
+ // Check the Step Value. It should be a non-zero integer value.
+ assert((!getConstIntStepValue() || !getConstIntStepValue()->isZero()) &&
+ "Step value is zero");
+
+ assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
+ "Step value should be constant for pointer induction");
+ assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) &&
+ "StepValue is not an integer");
+
+ assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) &&
+ "StepValue is not FP for FpInduction");
+ assert((IK != IK_FpInduction || (InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub))) &&
+ "Binary opcode should be specified for FP induction");
+}
+
+int InductionDescriptor::getConsecutiveDirection() const {
+ ConstantInt *ConstStep = getConstIntStepValue();
+ if (ConstStep && (ConstStep->isOne() || ConstStep->isMinusOne()))
+ return ConstStep->getSExtValue();
+ return 0;
+}
+
+ConstantInt *InductionDescriptor::getConstIntStepValue() const {
+ if (isa<SCEVConstant>(Step))
+ return dyn_cast<ConstantInt>(cast<SCEVConstant>(Step)->getValue());
+ return nullptr;
+}
+
+Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
+ ScalarEvolution *SE,
+ const DataLayout& DL) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
+ switch (IK) {
+ case IK_IntInduction: {
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+
+ // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
+ // and calculate (Start + Index * Step) for all cases, without
+ // special handling for "isOne" and "isMinusOne".
+ // But in real life the resulting code gets worse. We would mix SCEV
+ // expressions and ADD/SUB operations and end up with redundant
+ // intermediate values computed in different ways, which InstCombine
+ // is unable to reduce.
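+ //
+ // For illustration (assumption, not an original comment): with
+ // StartValue = %base, a constant step of 1 and Index = %i, the code below
+ // simply emits "add %base, %i" rather than expanding a SCEV expression.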
+
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isOne())
+ return B.CreateAdd(StartValue, Index);
+ const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
+ SE->getMulExpr(Step, SE->getSCEV(Index)));
+ return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
+ }
+ case IK_PtrInduction: {
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
+ Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
+ return B.CreateGEP(nullptr, StartValue, Index);
+ }
+ case IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+ // We have to check: MulExp may have been folded to a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue,
+ MulExp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
+ case IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
+bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
+ ScalarEvolution *SE,
+ InductionDescriptor &D) {
+
+ // Here we only handle FP induction variables.
+ assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type");
+
+ if (TheLoop->getHeader() != Phi->getParent())
+ return false;
+
+ // The loop may have multiple entrances or multiple exits; we can analyze
+ // this phi if it has a unique entry value and a unique backedge value.
+ if (Phi->getNumIncomingValues() != 2)
+ return false;
+ Value *BEValue = nullptr, *StartValue = nullptr;
+ if (TheLoop->contains(Phi->getIncomingBlock(0))) {
+ BEValue = Phi->getIncomingValue(0);
+ StartValue = Phi->getIncomingValue(1);
+ } else {
+ assert(TheLoop->contains(Phi->getIncomingBlock(1)) &&
+ "Unexpected Phi node in the loop");
+ BEValue = Phi->getIncomingValue(1);
+ StartValue = Phi->getIncomingValue(0);
+ }
+
+ BinaryOperator *BOp = dyn_cast<BinaryOperator>(BEValue);
+ if (!BOp)
+ return false;
+
+ Value *Addend = nullptr;
+ if (BOp->getOpcode() == Instruction::FAdd) {
+ if (BOp->getOperand(0) == Phi)
+ Addend = BOp->getOperand(1);
+ else if (BOp->getOperand(1) == Phi)
+ Addend = BOp->getOperand(0);
+ } else if (BOp->getOpcode() == Instruction::FSub)
+ if (BOp->getOperand(0) == Phi)
+ Addend = BOp->getOperand(1);
+
+ if (!Addend)
+ return false;
+
+ // The addend should be loop invariant
+ if (auto *I = dyn_cast<Instruction>(Addend))
+ if (TheLoop->contains(I))
+ return false;
+
+ // FP Step has unknown SCEV
+ const SCEV *Step = SE->getUnknown(Addend);
+ D = InductionDescriptor(StartValue, IK_FpInduction, Step, BOp);
+ return true;
+}
+
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
+ PredicatedScalarEvolution &PSE,
+ InductionDescriptor &D,
+ bool Assume) {
+ Type *PhiTy = Phi->getType();
+
+ // Handle integer and pointer induction variables.
+ // We now also handle FP induction, but without trying to form a
+ // recurrent expression from the PHI node in place.
+
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() &&
+ !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy())
+ return false;
+
+ if (PhiTy->isFloatingPointTy())
+ return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D);
+
+ const SCEV *PhiScev = PSE.getSCEV(Phi);
+ const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
+ // We need this expression to be an AddRecExpr.
+ if (Assume && !AR)
+ AR = PSE.getAsAddRec(Phi);
+
+ if (!AR) {
+ DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ return false;
+ }
+
+ return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
+}
+
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
+ ScalarEvolution *SE,
+ InductionDescriptor &D,
+ const SCEV *Expr) {
+ Type *PhiTy = Phi->getType();
+ // We only handle integer and pointer induction variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return false;
+
+ // Check that the PHI is consecutive.
+ const SCEV *PhiScev = Expr ? Expr : SE->getSCEV(Phi);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
+ if (!AR) {
+ DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ return false;
+ }
+
+ if (AR->getLoop() != TheLoop) {
+ // FIXME: We should treat this as a uniform. Unfortunately, we
+ // don't currently know how to handle uniform PHIs.
+ DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
+ return false;
+ }
+
+ Value *StartValue =
+ Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+ // Calculate the pointer stride and check if it is consecutive.
+ // The stride may be a constant or a loop invariant integer value.
+ const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
+ if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop))
+ return false;
+
+ if (PhiTy->isIntegerTy()) {
+ D = InductionDescriptor(StartValue, IK_IntInduction, Step);
+ return true;
+ }
+
+ assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ // Pointer induction should be a constant.
+ if (!ConstStep)
+ return false;
+
+ ConstantInt *CV = ConstStep->getValue();
+ Type *PointerElementType = PhiTy->getPointerElementType();
+ // The pointer stride cannot be determined if the pointer element type is not
+ // sized.
+ if (!PointerElementType->isSized())
+ return false;
+
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+ int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType));
+ if (!Size)
+ return false;
+
+ int64_t CVSize = CV->getSExtValue();
+ if (CVSize % Size)
+ return false;
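+ // Worked example (illustration only): for an i32 element type, Size is
+ // typically 4; a constant byte step of 8 therefore becomes an element stride
+ // of 2 below, while a byte step of 6 would have been rejected just above.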
+ auto *StepValue = SE->getConstant(CV->getType(), CVSize / Size,
+ true /* signed */);
+ D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue);
+ return true;
+}
+
+bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+
+ // We reuse a vector for the in-loop predecessors.
+ SmallVector<BasicBlock *, 4> InLoopPredecessors;
+
+ auto RewriteExit = [&](BasicBlock *BB) {
+ assert(InLoopPredecessors.empty() &&
+ "Must start with an empty predecessors list!");
+ auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
+
+ // See if there are any non-loop predecessors of this exit block and
+ // keep track of the in-loop predecessors.
+ bool IsDedicatedExit = true;
+ for (auto *PredBB : predecessors(BB))
+ if (L->contains(PredBB)) {
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from an indirectbr.
+ return false;
+
+ InLoopPredecessors.push_back(PredBB);
+ } else {
+ IsDedicatedExit = false;
+ }
+
+ assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
+
+ // Nothing to do if this is already a dedicated exit.
+ if (IsDedicatedExit)
+ return false;
+
+ auto *NewExitBB = SplitBlockPredecessors(
+ BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);
+
+ if (!NewExitBB)
+ DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
+ else
+ DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
+ return true;
+ };
+
+ // Walk the exit blocks directly rather than building up a data structure for
+ // them, but only visit each one once.
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ for (auto *BB : L->blocks())
+ for (auto *SuccBB : successors(BB)) {
+ // We're looking for exit blocks so skip in-loop successors.
+ if (L->contains(SuccBB))
+ continue;
+
+ // Visit each exit block exactly once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ Changed |= RewriteExit(SuccBB);
+ }
+
+ return Changed;
+}
+
+/// \brief Returns the instructions that use values defined in the loop.
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
+ SmallVector<Instruction *, 8> UsedOutside;
+
+ for (auto *Block : L->getBlocks())
+ // FIXME: I believe that this could use copy_if if the Inst reference could
+ // be adapted into a pointer.
+ for (auto &Inst : *Block) {
+ auto Users = Inst.users();
+ if (any_of(Users, [&](User *U) {
+ auto *Use = cast<Instruction>(U);
+ return !L->contains(Use->getParent());
+ }))
+ UsedOutside.push_back(&Inst);
+ }
+
+ return UsedOutside;
+}
+
+void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
+ // By definition, all loop passes need the LoopInfo analysis and the
+ // Dominator tree it depends on. Because they all participate in the loop
+ // pass manager, they must also preserve these.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
+ // here because users shouldn't directly get them from this header.
+ extern char &LoopSimplifyID;
+ extern char &LCSSAID;
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ // This is used in the LPPassManager to perform LCSSA verification on passes
+ // which preserve LCSSA form.
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
+
+ // Loop passes are designed to run inside of a loop pass manager which means
+ // that any function analyses they require must be required by the first loop
+ // pass in the manager (so that it is computed before the loop pass manager
+ // runs) and preserved by all loop passes in the manager. To make this
+ // reasonably robust, the set needed for most loop passes is maintained here.
+ // If your loop pass requires an analysis not listed here, you will need to
+ // carefully audit the loop pass manager nesting structure that results.
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+}
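+
+// Illustrative sketch (disabled; not part of this patch): a legacy loop pass
+// normally just forwards to this helper from its own getAnalysisUsage
+// override. The pass name is hypothetical and LoopPass.h is assumed to be
+// included.
+#if 0
+struct HypotheticalLoopPass : public LoopPass {
+  static char ID;
+  HypotheticalLoopPass() : LoopPass(ID) {}
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override { return false; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Pulls in DominatorTree, LoopInfo, LoopSimplify, LCSSA, AA and SCEV.
+    getLoopAnalysisUsage(AU);
+  }
+};
+#endif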
+
+/// Manually defined generic "LoopPass" dependency initialization. This is used
+/// to initialize the exact set of passes from above in \c
+/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
+/// with:
+///
+/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
+///
+/// As-if "LoopPass" were a pass.
+void llvm::initializeLoopPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+}
+
+/// \brief Find string metadata for loop
+///
+/// If the named metadata entry has a value (e.g. {"llvm.distribute", 1}),
+/// return that value as an operand; return null if the entry has no value.
+/// If the string metadata is not found at all, return Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
+ StringRef Name) {
+ MDNode *LoopID = TheLoop->getLoopID();
+  // Return None if the loop has no loop-ID metadata.
+ if (!LoopID)
+ return None;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ // Iterate over LoopID operands and look for MDString Metadata
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+    // If this MDString matches Name, return its value operand (or null if none).
+ if (Name.equals(S->getString()))
+ switch (MD->getNumOperands()) {
+ case 1:
+ return nullptr;
+ case 2:
+ return &MD->getOperand(1);
+ default:
+ llvm_unreachable("loop metadata has 0 or 1 operand");
+ }
+ }
+ return None;
+}
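+
+// Illustrative sketch (disabled; not part of this patch): reading a boolean
+// entry such as the {"llvm.distribute", 1} example mentioned above. The helper
+// name is hypothetical.
+#if 0
+static bool readBoolLoopMetadata(Loop *L, StringRef Name) {
+  Optional<const MDOperand *> Op = findStringMetadataForLoop(L, Name);
+  if (!Op || !*Op)
+    return false; // entry missing, or present with no value operand
+  if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>((*Op)->get()))
+    return CI->getZExtValue() != 0;
+  return false;
+}
+#endif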
+
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once.
+bool llvm::isGuaranteedToExecute(const Instruction &Inst,
+ const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ // We have to check to make sure that the instruction dominates all
+ // of the exit blocks. If it doesn't, then there is a path out of the loop
+ // which does not execute this instruction, so we can't hoist it.
+
+ // If the instruction is in the header block for the loop (which is very
+ // common), it is always guaranteed to dominate the exit blocks. Since this
+ // is a common case, and can save some work, check it now.
+ if (Inst.getParent() == CurLoop->getHeader())
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst.
+ return !SafetyInfo->HeaderMayThrow;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (SafetyInfo->MayThrow)
+ return false;
+
+ // Get the exit blocks for the current loop.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ // Verify that the block dominates each of the exit blocks of the loop.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (!DT->dominates(Inst.getParent(), ExitBlock))
+ return false;
+
+ // As a degenerate case, if the loop is statically infinite then we haven't
+ // proven anything since there are no exit blocks.
+ if (ExitBlocks.empty())
+ return false;
+
+ // FIXME: In general, we have to prove that the loop isn't an infinite loop.
+  // See http://llvm.org/PR24078. (The "ExitBlocks.empty()" check above is
+ // just a special case of this.)
+ return true;
+}
+
+Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
+ // Only support loops with a unique exiting block, and a latch.
+ if (!L->getExitingBlock())
+ return None;
+
+  // Get the branch weights for the loop's backedge.
+ BranchInst *LatchBR =
+ dyn_cast<BranchInst>(L->getLoopLatch()->getTerminator());
+ if (!LatchBR || LatchBR->getNumSuccessors() != 2)
+ return None;
+
+ assert((LatchBR->getSuccessor(0) == L->getHeader() ||
+ LatchBR->getSuccessor(1) == L->getHeader()) &&
+ "At least one edge out of the latch must go to the header");
+
+ // To estimate the number of times the loop body was executed, we want to
+ // know the number of times the backedge was taken, vs. the number of times
+ // we exited the loop.
+ uint64_t TrueVal, FalseVal;
+ if (!LatchBR->extractProfMetadata(TrueVal, FalseVal))
+ return None;
+
+ if (!TrueVal || !FalseVal)
+ return 0;
+
+ // Divide the count of the backedge by the count of the edge exiting the loop,
+ // rounding to nearest.
+ if (LatchBR->getSuccessor(0) == L->getHeader())
+ return (TrueVal + (FalseVal / 2)) / FalseVal;
+ else
+ return (FalseVal + (TrueVal / 2)) / TrueVal;
+}
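+
+// Worked example (illustrative, with made-up profile weights): if the latch
+// branch carries branch_weights of TrueVal = 300 and FalseVal = 100 and its
+// successor 0 is the header, the backedge was taken ~300 times across ~100
+// exits, so the estimate is (300 + 100/2) / 100 = 3 iterations per loop entry.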
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
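+
+// Worked example (illustrative): for a hypothetical <8 x i32> input the loop
+// above performs log2(8) = 3 rounds. Round one uses the mask <4,5,6,7,u,u,u,u>
+// to fold lanes 4..7 onto lanes 0..3, round two folds lanes 2..3 onto lanes
+// 0..1, round three folds lane 1 onto lane 0, and the final scalar is read out
+// of lane 0 by the extractelement at the end of the function.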
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
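+
+// Illustrative sketch (disabled; not part of this patch): emitting an integer
+// add reduction of a vector value. `B`, `TTI` and `Vec` stand for objects the
+// caller already owns; default-constructed ReductionFlags mean "not a min/max
+// reduction".
+#if 0
+Value *emitVectorSum(IRBuilder<> &B, const TargetTransformInfo *TTI,
+                     Value *Vec) {
+  TargetTransformInfo::ReductionFlags Flags;
+  // Uses a target reduction intrinsic when profitable, otherwise falls back
+  // to the log2 shuffle sequence emitted by getShuffleReduction above.
+  return createSimpleTargetReduction(B, TTI, Instruction::Add, Vec, Flags);
+}
+#endif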
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
new file mode 100644
index 000000000000..29756d9dab7f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -0,0 +1,323 @@
+//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that memory accesses which may alias do not in fact overlap,
+// and emits runtime checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
+ cl::Hidden,
+ cl::desc("Add no-alias annotation for instructions that "
+ "are disambiguated by memchecks"));
+
+LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
+ DominatorTree *DT, ScalarEvolution *SE,
+ bool UseLAIChecks)
+ : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT),
+ SE(SE) {
+ assert(L->getExitBlock() && "No single exit block");
+ assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
+ if (UseLAIChecks) {
+ setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
+ setSCEVChecks(LAI.getPSE().getUnionPredicate());
+ }
+}
+
+void LoopVersioning::setAliasChecks(
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
+ AliasChecks = std::move(Checks);
+}
+
+void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) {
+ Preds = std::move(Check);
+}
+
+void LoopVersioning::versionLoop(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ Value *SCEVRuntimeCheck;
+ Value *RuntimeCheck = nullptr;
+
+ // Add the memcheck in the original preheader (this is empty initially).
+ BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
+
+ const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
+ SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
+ "scev.check");
+ SCEVRuntimeCheck =
+ Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator());
+ auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
+
+ // Discard the SCEV runtime check if it is always true.
+ if (CI && CI->isZero())
+ SCEVRuntimeCheck = nullptr;
+
+ if (MemRuntimeCheck && SCEVRuntimeCheck) {
+ RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
+ SCEVRuntimeCheck, "lver.safe");
+ if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
+ I->insertBefore(RuntimeCheckBB->getTerminator());
+ } else
+ RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
+
+ assert(RuntimeCheck && "called even though we don't need "
+ "any runtime checks");
+
+ // Rename the block to make the IR more readable.
+ RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() +
+ ".lver.check");
+
+ // Create empty preheader for the loop (and after cloning for the
+ // non-versioned loop).
+ BasicBlock *PH =
+ SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI);
+ PH->setName(VersionedLoop->getHeader()->getName() + ".ph");
+
+ // Clone the loop including the preheader.
+ //
+ // FIXME: This does not currently preserve SimplifyLoop because the exit
+ // block is a join between the two loops.
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+ NonVersionedLoop =
+ cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap,
+ ".lver.orig", LI, DT, NonVersionedLoopBlocks);
+ remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+ // Insert the conditional branch based on the result of the memchecks.
+ Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
+ BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
+ OrigTerm->eraseFromParent();
+
+ // The loops merge in the original exit block. This is now dominated by the
+ // memchecking block.
+ DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB);
+
+ // Adds the necessary PHI nodes for the versioned loops based on the
+ // loop-defined values used outside of the loop.
+ addPHINodes(DefsUsedOutside);
+}
+
+void LoopVersioning::addPHINodes(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
+ assert(PHIBlock && "No single successor to loop exit block");
+ PHINode *PN;
+
+ // First add a single-operand PHI for each DefsUsedOutside if one does not
+  // exist yet.
+ for (auto *Inst : DefsUsedOutside) {
+ // See if we have a single-operand PHI with the value defined by the
+ // original loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ if (PN->getIncomingValue(0) == Inst)
+ break;
+ }
+    // If not, create it.
+ if (!PN) {
+ PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
+ &PHIBlock->front());
+ for (auto *User : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
+ User->replaceUsesOfWith(Inst, PN);
+ PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
+ }
+ }
+
+ // Then for each PHI add the operand for the edge from the cloned loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+           "Exit block should only have one predecessor");
+
+    // If the definition was cloned, use that; otherwise use the original value.
+ Value *ClonedValue = PN->getIncomingValue(0);
+ auto Mapped = VMap.find(ClonedValue);
+ if (Mapped != VMap.end())
+ ClonedValue = Mapped->second;
+
+ PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
+ }
+}
+
+void LoopVersioning::prepareNoAliasMetadata() {
+ // We need to turn the no-alias relation between pointer checking groups into
+ // no-aliasing annotations between instructions.
+ //
+ // We accomplish this by mapping each pointer checking group (a set of
+ // pointers memchecked together) to an alias scope and then also mapping each
+ // group to the list of scopes it can't alias.
+
+ const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+
+ // First allocate an aliasing scope for each pointer checking group.
+ //
+ // While traversing through the checking groups in the loop, also create a
+ // reverse map from pointers to the pointer checking group they were assigned
+ // to.
+ MDBuilder MDB(Context);
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
+
+ for (const auto &Group : RtPtrChecking->CheckingGroups) {
+ GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
+
+ for (unsigned PtrIdx : Group.Members)
+ PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
+ }
+
+ // Go through the checks and for each pointer group, collect the scopes for
+ // each non-aliasing pointer group.
+ DenseMap<const RuntimePointerChecking::CheckingPtrGroup *,
+ SmallVector<Metadata *, 4>>
+ GroupToNonAliasingScopes;
+
+ for (const auto &Check : AliasChecks)
+ GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
+
+ // Finally, transform the above to actually map to scope list which is what
+ // the metadata uses.
+
+ for (auto Pair : GroupToNonAliasingScopes)
+ GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
+}
+
+void LoopVersioning::annotateLoopWithNoAlias() {
+ if (!AnnotateNoAlias)
+ return;
+
+ // First prepare the maps.
+ prepareNoAliasMetadata();
+
+ // Add the scope and no-alias metadata to the instructions.
+ for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
+ annotateInstWithNoAlias(I);
+ }
+}
+
+void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
+ const Instruction *OrigInst) {
+ if (!AnnotateNoAlias)
+ return;
+
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+ const Value *Ptr = isa<LoadInst>(OrigInst)
+ ? cast<LoadInst>(OrigInst)->getPointerOperand()
+ : cast<StoreInst>(OrigInst)->getPointerOperand();
+
+ // Find the group for the pointer and then add the scope metadata.
+ auto Group = PtrToGroup.find(Ptr);
+ if (Group != PtrToGroup.end()) {
+ VersionedInst->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Context, GroupToScope[Group->second])));
+
+ // Add the no-alias metadata.
+ auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
+ if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
+ VersionedInst->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_noalias),
+ NonAliasingScopeList->second));
+ }
+}
+
+namespace {
+/// \brief Also expose this as a pass. Currently this is only used for
+/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
+/// array accesses from the loop.
+class LoopVersioningPass : public FunctionPass {
+public:
+ LoopVersioningPass() : FunctionPass(ID) {
+ initializeLoopVersioningPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ // Build up a worklist of inner-loops to version. This is necessary as the
+ // act of versioning a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ const LoopAccessInfo &LAI = LAA->getInfo(L);
+ if (L->isLoopSimplifyForm() && (LAI.getNumRuntimePointerChecks() ||
+ !LAI.getPSE().getUnionPredicate().isAlwaysTrue())) {
+ LoopVersioning LVer(LAI, L, LI, DT, SE);
+ LVer.versionLoop();
+ LVer.annotateLoopWithNoAlias();
+ Changed = true;
+ }
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+#define LVER_OPTION "loop-versioning"
+#define DEBUG_TYPE LVER_OPTION
+
+char LoopVersioningPass::ID;
+static const char LVer_name[] = "Loop Versioning";
+
+INITIALIZE_PASS_BEGIN(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopVersioningPass() {
+ return new LoopVersioningPass();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
new file mode 100644
index 000000000000..ee84541e526d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -0,0 +1,94 @@
+//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding. This pass converts 'invoke' instructions to 'call'
+// instructions, so that any exception-handling 'landingpad' blocks become dead
+// code (which can be removed by running the '-simplifycfg' pass afterwards).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerInvoke.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lowerinvoke"
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+
+namespace {
+ class LowerInvokeLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
+ initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LowerInvokeLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
+ "Lower invoke and unwind, for unwindless code generators",
+ false, false)
+
+static bool runImpl(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+ SmallVector<Value *, 16> CallArgs(II->op_begin(), II->op_end() - 3);
+ // Insert a normal call instruction...
+ CallInst *NewCall =
+ CallInst::Create(II->getCalledValue(), CallArgs, "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ II->replaceAllUsesWith(NewCall);
+
+ // Insert an unconditional branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Remove any PHI node entries from the exception destination.
+ II->getUnwindDest()->removePredecessor(&BB);
+
+ // Remove the invoke instruction now.
+ BB.getInstList().erase(II);
+
+ ++NumInvokes;
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
+ return runImpl(F);
+}
+
+namespace llvm {
+char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
+
+PreservedAnalyses LowerInvokePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = runImpl(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
new file mode 100644
index 000000000000..900450b40061
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -0,0 +1,510 @@
+//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static unsigned getLoopOperandSizeInBytes(Type *Type) {
+ if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
+ return VTy->getBitWidth() / 8;
+ }
+
+ return Type->getPrimitiveSizeInBits() / 8;
+}
+
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, ConstantInt *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length copies.
+ if (CopyLen->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB = nullptr;
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ Type *LoopOpType =
+ TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+
+ unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ if (LoopEndCount != 0) {
+    // Split the original block so everything from InsertBefore on becomes the
+    // post-loop block.
+ PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+ PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ // Cast the Src and Dst pointers to pointers to the loop operand type (if
+ // needed).
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+ // Loop Body
+ Value *SrcGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ // Create the loop branch condition.
+ Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+ LoopBB, PostLoopBB);
+ }
+
+ uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+ if (RemainingBytes) {
+ IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+ : InsertBefore);
+
+ // Update the alignment based on the copy size used in the loop body.
+ SrcAlign = std::min(SrcAlign, LoopOpSize);
+ DestAlign = std::min(DestAlign, LoopOpSize);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAlign, DestAlign);
+
+ for (auto OpTy : RemainingOps) {
+      // Calculate the new index.
+ unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no Remainder!");
+ // Cast source to operand type and load
+ PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+ Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+ ? SrcAddr
+ : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+ Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Value *Load = RBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+
+ // Cast destination to operand type and store.
+ PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+ Value *CastedDst = DstAddr->getType() == DstPtrType
+ ? DstAddr
+ : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ BytesCopied += OperandSize;
+ }
+ }
+ assert(BytesCopied == CopyLen->getZExtValue() &&
+ "Bytes copied should match size in the call!");
+}
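+
+// Worked example (illustrative; the types involved are target-dependent): for
+// CopyLen = 23 and a loop operand type of i64 (LoopOpSize = 8), the loop above
+// runs LoopEndCount = 23 / 8 = 2 iterations and copies 16 bytes, leaving
+// RemainingBytes = 7 for the residual code, which the target might cover with
+// e.g. an i32 + i16 + i8 sequence from getMemcpyLoopResidualLoweringType.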
+
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, unsigned SrcAlign,
+ unsigned DestAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB =
+ PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ Type *LoopOpType =
+ TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+ unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ // Calculate the loop trip count, and remaining bytes to copy after the loop.
+ Type *CopyLenType = CopyLen->getType();
+ IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+ assert(ILengthType &&
+ "expected size argument to memcpy to be an integer type!");
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr);
+ IRBuilder<> LoopBuilder(LoopBB);
+
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+ Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ if (LoopOpType != Int8Type) {
+ // Loop body for the residual copy.
+ BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+ PreLoopBB->getParent(), nullptr);
+ // Residual loop header.
+ BasicBlock *ResHeaderBB = BasicBlock::Create(
+ Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+    // Need to update the pre-loop basic block to branch to the correct place:
+    // branch to the main loop if the trip count is non-zero, branch to the
+    // residual loop if the copy size is smaller than one iteration of the main
+    // loop but non-zero, and finally branch past the residual loop if the
+    // memcpy size is zero.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, ResHeaderBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ ResHeaderBB);
+
+ // Determine if we need to branch to the residual loop or bypass it.
+ IRBuilder<> RHBuilder(ResHeaderBB);
+ RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+ ResLoopBB, PostLoopBB);
+
+ // Copy the residual with single byte load/store loop.
+ IRBuilder<> ResBuilder(ResLoopBB);
+ PHINode *ResidualIndex =
+ ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+ ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+ Value *SrcAsInt8 =
+ ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+ Value *DstAsInt8 =
+ ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+ Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+ Value *SrcGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+ Value *Load = ResBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+ ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *ResNewIndex =
+ ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+ ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+ // Create the loop branch condition.
+ ResBuilder.CreateCondBr(
+ ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+ PostLoopBB);
+ } else {
+ // In this case the loop operand type was a byte, and there is no need for a
+ // residual loop to copy the remaining memory after the main loop.
+ // We do however need to patch up the control flow by creating the
+ // terminators for the preloop block and the memcpy loop.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, PostLoopBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ PostLoopBB);
+ }
+}
+
+void llvm::createMemCpyLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
+ F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // SrcAddr and DstAddr are expected to be pointer types,
+ // so no check is made here.
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ // Cast pointers to (char *)
+ SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+ DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ // load from SrcAddr+LoopIndex
+ // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+ // word-sized loads and stores.
+ Value *Element =
+ LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+ LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+ SrcIsVolatile);
+ // store at DstAddr+LoopIndex
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+ DstAddr, LoopIndex),
+ DstIsVolatile);
+
+ // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+
+  // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateLoad(
+ LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *FwdElement = FwdLoopBuilder.CreateLoad(
+ FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+ FwdLoopBuilder.CreateStore(
+ FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore,
+ Value *DstAddr, Value *CopyLen, Value *SetValue,
+ unsigned Align, bool IsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB
+ = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // Cast pointer to the type of value getting stored
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ LoopBuilder.CreateStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ IsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+ const TargetTransformInfo &TTI) {
+ // Original implementation
+ if (!TTI.useWideIRMemcpyLoopLowering()) {
+ createMemCpyLoop(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile());
+ } else {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+ createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ CI,
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ } else {
+ createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+                                  /* TargetTransformInfo */ TTI);
+ }
+ }
+}
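+
+// Illustrative sketch (disabled; not part of this patch): a target pass that
+// wants to remove a memcpy intrinsic would typically expand it and then erase
+// the original call; `MI` and `TTI` stand for whatever the caller already has.
+#if 0
+void lowerOneMemCpy(MemCpyInst *MI, const TargetTransformInfo &TTI) {
+  expandMemCpyAsLoop(MI, TTI); // emits the load/store loop(s) before MI
+  MI->eraseFromParent();       // the original intrinsic is now dead
+}
+#endif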
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+ createMemMoveLoop(/* InsertBefore */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcAlign */ Memmove->getAlignment(),
+ /* DestAlign */ Memmove->getAlignment(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+ createMemSetLoop(/* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Alignment */ Memset->getAlignment(),
+ Memset->isVolatile());
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
new file mode 100644
index 000000000000..890afbc46e63
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -0,0 +1,531 @@
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-switch"
+
+namespace {
+ struct IntRange {
+ int64_t Low, High;
+ };
+ // Return true iff R is covered by Ranges.
+ static bool IsInRanges(const IntRange &R,
+ const std::vector<IntRange> &Ranges) {
+ // Note: Ranges must be sorted, non-overlapping and non-adjacent.
+
+ // Find the first range whose High field is >= R.High,
+ // then check if the Low field is <= R.Low. If so, we
+ // have a Range that covers R.
+ auto I = std::lower_bound(
+ Ranges.begin(), Ranges.end(), R,
+ [](const IntRange &A, const IntRange &B) { return A.High < B.High; });
+ return I != Ranges.end() && I->Low <= R.Low;
+ }
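+
+  // Worked example (illustrative): with Ranges = {[-10,-5], [0,3], [7,20]}
+  // (sorted by High), R = [8,15] finds [7,20] via lower_bound and 7 <= 8, so R
+  // is covered; R = [3,7] also lands on [7,20], but 7 > 3, so it is not.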
+
+ /// Replace all SwitchInst instructions with chained branch instructions.
+ class LowerSwitch : public FunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ LowerSwitch() : FunctionPass(ID) {
+ initializeLowerSwitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ struct CaseRange {
+ ConstantInt* Low;
+ ConstantInt* High;
+ BasicBlock* BB;
+
+ CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb)
+ : Low(low), High(high), BB(bb) {}
+ };
+
+ typedef std::vector<CaseRange> CaseVector;
+ typedef std::vector<CaseRange>::iterator CaseItr;
+ private:
+ void processSwitchInst(SwitchInst *SI, SmallPtrSetImpl<BasicBlock*> &DeleteList);
+
+ BasicBlock *switchConvert(CaseItr Begin, CaseItr End,
+ ConstantInt *LowerBound, ConstantInt *UpperBound,
+ Value *Val, BasicBlock *Predecessor,
+ BasicBlock *OrigBlock, BasicBlock *Default,
+ const std::vector<IntRange> &UnreachableRanges);
+ BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock,
+ BasicBlock *Default);
+ unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
+ };
+
+ /// The comparison function for sorting the switch case values in the vector.
+ /// WARNING: Case ranges should be disjoint!
+ struct CaseCmp {
+ bool operator () (const LowerSwitch::CaseRange& C1,
+ const LowerSwitch::CaseRange& C2) {
+
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
+}
+
+char LowerSwitch::ID = 0;
+INITIALIZE_PASS(LowerSwitch, "lowerswitch",
+ "Lower SwitchInst's to branches", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LowerSwitchID = LowerSwitch::ID;
+// createLowerSwitchPass - Interface to this file...
+FunctionPass *llvm::createLowerSwitchPass() {
+ return new LowerSwitch();
+}
+
+bool LowerSwitch::runOnFunction(Function &F) {
+ bool Changed = false;
+ SmallPtrSet<BasicBlock*, 8> DeleteList;
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+ BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks
+
+ // If the block is a dead Default block that will be deleted later, don't
+ // waste time processing it.
+ if (DeleteList.count(Cur))
+ continue;
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+ Changed = true;
+ processSwitchInst(SI, DeleteList);
+ }
+ }
+
+ for (BasicBlock* BB: DeleteList) {
+ DeleteDeadBlock(BB);
+ }
+
+ return Changed;
+}
+
+/// Used for debugging purposes.
+static raw_ostream& operator<<(raw_ostream &O,
+ const LowerSwitch::CaseVector &C)
+ LLVM_ATTRIBUTE_USED;
+static raw_ostream& operator<<(raw_ostream &O,
+ const LowerSwitch::CaseVector &C) {
+ O << "[";
+
+ for (LowerSwitch::CaseVector::const_iterator B = C.begin(),
+ E = C.end(); B != E; ) {
+ O << *B->Low << " -" << *B->High;
+ if (++B != E) O << ", ";
+ }
+
+ return O << "]";
+}
+
+/// \brief Update the first occurrence of the "switch statement" BB in the PHI
+/// node with the "new" BB. The other occurrences will:
+///
+/// 1) Be updated by subsequent calls to this function. Switch statements may
+/// have more than one outgoing edge into the same BB if several case values
+/// share a destination. When the switch statement is converted, these incoming
+/// edges now come from multiple BBs.
+/// 2) Removed if subsequent incoming values now share the same case, i.e.,
+/// multiple outgoing edges are condensed into one. This is necessary to keep
+/// the number of phi values equal to the number of branches to SuccBB.
+static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
+ unsigned NumMergedCases) {
+ for (BasicBlock::iterator I = SuccBB->begin(),
+ IE = SuccBB->getFirstNonPHI()->getIterator();
+ I != IE; ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // Only update the first occurrence.
+ unsigned Idx = 0, E = PN->getNumIncomingValues();
+ unsigned LocalNumMergedCases = NumMergedCases;
+ for (; Idx != E; ++Idx) {
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ PN->setIncomingBlock(Idx, NewBB);
+ break;
+ }
+ }
+
+ // Remove additional occurrences coming from condensed cases and keep the
+ // number of incoming values equal to the number of branches to SuccBB.
+ SmallVector<unsigned, 8> Indices;
+ for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ Indices.push_back(Idx);
+ LocalNumMergedCases--;
+ }
+    // Remove incoming values in reverse order so that removing one entry does
+    // not invalidate the indices of the remaining ones.
+ for (unsigned III : reverse(Indices))
+ PN->removeIncomingValue(III);
+ }
+}
+
+/// Convert the switch statement into a binary lookup of the case values.
+/// The function recursively builds this tree. LowerBound and UpperBound are
+/// used to keep track of the bounds for Val that have already been checked by
+/// a block emitted by one of the previous calls to switchConvert in the call
+/// stack.
+BasicBlock *
+LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
+ ConstantInt *UpperBound, Value *Val,
+ BasicBlock *Predecessor, BasicBlock *OrigBlock,
+ BasicBlock *Default,
+ const std::vector<IntRange> &UnreachableRanges) {
+ unsigned Size = End - Begin;
+
+ if (Size == 1) {
+ // Check if the Case Range is perfectly squeezed in between
+ // already checked Upper and Lower bounds. If it is then we can avoid
+ // emitting the code that checks if the value actually falls in the range
+ // because the bounds already tell us so.
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+ unsigned NumMergedCases = 0;
+ if (LowerBound && UpperBound)
+ NumMergedCases =
+ UpperBound->getSExtValue() - LowerBound->getSExtValue();
+ fixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases);
+ return Begin->BB;
+ }
+ return newLeafBlock(*Begin, Val, OrigBlock, Default);
+ }
+
+ unsigned Mid = Size / 2;
+ std::vector<CaseRange> LHS(Begin, Begin + Mid);
+ DEBUG(dbgs() << "LHS: " << LHS << "\n");
+ std::vector<CaseRange> RHS(Begin + Mid, End);
+ DEBUG(dbgs() << "RHS: " << RHS << "\n");
+
+ CaseRange &Pivot = *(Begin + Mid);
+ DEBUG(dbgs() << "Pivot ==> "
+ << Pivot.Low->getValue()
+ << " -" << Pivot.High->getValue() << "\n");
+
+ // NewLowerBound here should never be the integer minimal value.
+ // This is because it is computed from a case range that is never
+ // the smallest, so there is always a case range that has at least
+ // a smaller value.
+ ConstantInt *NewLowerBound = Pivot.Low;
+
+ // Because NewLowerBound is never the smallest representable integer
+ // it is safe here to subtract one.
+ ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
+ NewLowerBound->getValue() - 1);
+
+ if (!UnreachableRanges.empty()) {
+ // Check if the gap between LHS's highest and NewLowerBound is unreachable.
+ int64_t GapLow = LHS.back().High->getSExtValue() + 1;
+ int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
+ IntRange Gap = { GapLow, GapHigh };
+ if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
+ NewUpperBound = LHS.back().High;
+ }
+
+ DEBUG(dbgs() << "LHS Bounds ==> ";
+ if (LowerBound) {
+ dbgs() << LowerBound->getSExtValue();
+ } else {
+ dbgs() << "NONE";
+ }
+ dbgs() << " - " << NewUpperBound->getSExtValue() << "\n";
+ dbgs() << "RHS Bounds ==> ";
+ dbgs() << NewLowerBound->getSExtValue() << " - ";
+ if (UpperBound) {
+ dbgs() << UpperBound->getSExtValue() << "\n";
+ } else {
+ dbgs() << "NONE\n";
+ });
+
+ // Create a new node that checks if the value is < pivot. Go to the
+ // left branch if it is and right branch if not.
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
+
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
+ Val, Pivot.Low, "Pivot");
+
+ BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
+ NewUpperBound, Val, NewNode, OrigBlock,
+ Default, UnreachableRanges);
+ BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
+ UpperBound, Val, NewNode, OrigBlock,
+ Default, UnreachableRanges);
+
+ F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);
+ NewNode->getInstList().push_back(Comp);
+
+ BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+ return NewNode;
+}
+
+/// Create a new leaf block for the binary lookup tree. It checks if the
+/// switch's value == the case's value. If not, then it jumps to the default
+/// branch. At this point in the tree, the value can't be another valid case
+/// value, so the jump to the "default" branch is warranted.
+BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
+ BasicBlock* OrigBlock,
+ BasicBlock* Default)
+{
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock");
+ F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf);
+
+ // Emit comparison
+ ICmpInst* Comp = nullptr;
+ if (Leaf.Low == Leaf.High) {
+ // Make the seteq instruction...
+ Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val,
+ Leaf.Low, "SwitchLeaf");
+ } else {
+ // Make range comparison
+ if (Leaf.Low->isMinValue(true /*isSigned*/)) {
+ // Val >= Min && Val <= Hi --> Val <= Hi
+ Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
+ "SwitchLeaf");
+ } else if (Leaf.Low->isZero()) {
+ // Val >= 0 && Val <= Hi --> Val <=u Hi
+ Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
+ "SwitchLeaf");
+ } else {
+ // Emit V-Lo <=u Hi-Lo
+ Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
+ Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo,
+ Val->getName()+".off",
+ NewLeaf);
+ Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
+ Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
+ "SwitchLeaf");
+ }
+ }
+
+ // Make the conditional branch...
+ BasicBlock* Succ = Leaf.BB;
+ BranchInst::Create(Succ, Default, Comp, NewLeaf);
+
+ // If there were any PHI nodes in this successor, rewrite one entry
+ // from OrigBlock to come from NewLeaf.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode* PN = cast<PHINode>(I);
+ // Remove all but one incoming entries from the cluster
+ uint64_t Range = Leaf.High->getSExtValue() -
+ Leaf.Low->getSExtValue();
+ for (uint64_t j = 0; j < Range; ++j) {
+ PN->removeIncomingValue(OrigBlock);
+ }
+
+ int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+ assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+ PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
+ }
+
+ return NewLeaf;
+}
+
+/// Transform the simple list of cases into a list of CaseRanges.
+unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
+ unsigned numCmps = 0;
+
+ // Start with "simple" cases
+ for (auto Case : SI->cases())
+ Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge adjacent cases that branch to the same destination into clusters.
+ if (Cases.size() >= 2) {
+ CaseItr I = Cases.begin();
+ for (CaseItr J = std::next(I), E = Cases.end(); J != E; ++J) {
+ int64_t nextValue = J->Low->getSExtValue();
+ int64_t currentValue = I->High->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ assert(nextValue > currentValue && "Cases should be strictly ascending");
+ if ((nextValue == currentValue + 1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ // FIXME: Combine branch weights.
+ } else if (++I != J) {
+ *I = *J;
+ }
+ }
+ Cases.erase(std::next(I), Cases.end());
+ }
+
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
+ // A range counts double, since it requires two compares.
+ ++numCmps;
+ }
+
+ return numCmps;
+}
+
+/// Replace the specified switch instruction with a sequence of chained if-then
+/// insts in a balanced binary search.
+void LowerSwitch::processSwitchInst(SwitchInst *SI,
+ SmallPtrSetImpl<BasicBlock*> &DeleteList) {
+ BasicBlock *CurBlock = SI->getParent();
+ BasicBlock *OrigBlock = CurBlock;
+ Function *F = CurBlock->getParent();
+ Value *Val = SI->getCondition(); // The value we are switching on...
+ BasicBlock* Default = SI->getDefaultDest();
+
+ // Don't handle unreachable blocks. If there are successors with phis, this
+ // would leave them behind with missing predecessors.
+ if ((CurBlock != &F->getEntryBlock() && pred_empty(CurBlock)) ||
+ CurBlock->getSinglePredecessor() == CurBlock) {
+ DeleteList.insert(CurBlock);
+ return;
+ }
+
+ // If there is only the default destination, just branch.
+ if (!SI->getNumCases()) {
+ BranchInst::Create(Default, CurBlock);
+ SI->eraseFromParent();
+ return;
+ }
+
+ // Prepare cases vector.
+ CaseVector Cases;
+ unsigned numCmps = Clusterify(Cases, SI);
+ DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total compares: " << numCmps << "\n");
+ DEBUG(dbgs() << "Cases: " << Cases << "\n");
+ (void)numCmps;
+
+ ConstantInt *LowerBound = nullptr;
+ ConstantInt *UpperBound = nullptr;
+ std::vector<IntRange> UnreachableRanges;
+
+ if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
+ // Make the bounds tightly fitted around the case value range, because we
+ // know that the value passed to the switch must be exactly one of the case
+ // values.
+ assert(!Cases.empty());
+ LowerBound = Cases.front().Low;
+ UpperBound = Cases.back().High;
+
+ DenseMap<BasicBlock *, unsigned> Popularity;
+ unsigned MaxPop = 0;
+ BasicBlock *PopSucc = nullptr;
+
+ IntRange R = { INT64_MIN, INT64_MAX };
+ UnreachableRanges.push_back(R);
+ for (const auto &I : Cases) {
+ int64_t Low = I.Low->getSExtValue();
+ int64_t High = I.High->getSExtValue();
+
+ IntRange &LastRange = UnreachableRanges.back();
+ if (LastRange.Low == Low) {
+ // There is nothing left of the previous range.
+ UnreachableRanges.pop_back();
+ } else {
+ // Terminate the previous range.
+ assert(Low > LastRange.Low);
+ LastRange.High = Low - 1;
+ }
+ if (High != INT64_MAX) {
+ IntRange R = { High + 1, INT64_MAX };
+ UnreachableRanges.push_back(R);
+ }
+
+ // Count popularity.
+ int64_t N = High - Low + 1;
+ unsigned &Pop = Popularity[I.BB];
+ if ((Pop += N) > MaxPop) {
+ MaxPop = Pop;
+ PopSucc = I.BB;
+ }
+ }
+#ifndef NDEBUG
+  // UnreachableRanges should be sorted and the ranges non-adjacent.
+ for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
+ I != E; ++I) {
+ assert(I->Low <= I->High);
+ auto Next = I + 1;
+ if (Next != E) {
+ assert(Next->Low > I->High);
+ }
+ }
+#endif
+
+ // Use the most popular block as the new default, reducing the number of
+ // cases.
+ assert(MaxPop > 0 && PopSucc);
+ Default = PopSucc;
+ Cases.erase(
+ remove_if(Cases,
+ [PopSucc](const CaseRange &R) { return R.BB == PopSucc; }),
+ Cases.end());
+
+ // If there are no cases left, just branch.
+ if (Cases.empty()) {
+ BranchInst::Create(Default, CurBlock);
+ SI->eraseFromParent();
+ return;
+ }
+ }
+
+  // Create a new, empty default block so that the new hierarchy of
+  // if-then statements branches to it and the PHI nodes are happy.
+ BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+ F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
+ BranchInst::Create(Default, NewDefault);
+
+ // If there is an entry in any PHI nodes for the default edge, make sure
+ // to update them as well.
+ for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+ assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+ PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
+ }
+
+ BasicBlock *SwitchBlock =
+ switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
+ OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+
+ // Branch to our shiny new if-then stuff...
+ BranchInst::Create(SwitchBlock, OrigBlock);
+
+ // We are now done with the switch instruction, delete it.
+ BasicBlock *OldDefault = SI->getDefaultDest();
+ CurBlock->getInstList().erase(SI);
+
+ // If the Default block has no more predecessors just add it to DeleteList.
+ if (pred_begin(OldDefault) == pred_end(OldDefault))
+ DeleteList.insert(OldDefault);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
new file mode 100644
index 000000000000..b659a2e4463f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -0,0 +1,108 @@
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
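+// Repeatedly scan the entry block for promotable allocas and promote them
+// until none remain.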
+static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
+ AssumptionCache &AC) {
+ std::vector<AllocaInst *> Allocas;
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+ bool Changed = false;
+
+  while (true) {
+ Allocas.clear();
+
+ // Find allocas that are safe to promote, by looking at all instructions in
+ // the entry node
+ for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (isAllocaPromotable(AI))
+ Allocas.push_back(AI);
+
+ if (Allocas.empty())
+ break;
+
+ PromoteMemToReg(Allocas, DT, &AC);
+ NumPromoted += Allocas.size();
+ Changed = true;
+ }
+ return Changed;
+}
+
+PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ if (!promoteMemoryToRegister(F, DT, AC))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct PromoteLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PromoteLegacyPass() : FunctionPass(ID) {
+ initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - To run this pass, first we calculate the alloca
+ // instructions that are safe for promotion, then we promote each one.
+ //
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return promoteMemoryToRegister(F, DT, AC);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+ };
+} // end of anonymous namespace
+
+char PromoteLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
+ "Register",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
+ false, false)
+
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+//
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+ return new PromoteLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
new file mode 100644
index 000000000000..9f2ad540c83d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -0,0 +1,161 @@
+//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntactic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+
+ // This PRNG is from the ISO C spec. It is intentionally simple and
+ // unsuitable for cryptographic use. We're just looking for enough
+ // variety to surprise and delight users.
+ struct PRNG {
+ unsigned long next;
+
+ void srand(unsigned int seed) {
+ next = seed;
+ }
+
+ int rand() {
+ next = next * 1103515245 + 12345;
+ return (unsigned int)(next / 65536) % 32768;
+ }
+ };
+
+ static const char *const metaNames[] = {
+ // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+ "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+ "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+ };
+
+ struct Renamer {
+ Renamer(unsigned int seed) {
+ prng.srand(seed);
+ }
+
+ const char *newName() {
+ return metaNames[prng.rand() % array_lengthof(metaNames)];
+ }
+
+ PRNG prng;
+ };
+
+ struct MetaRenamer : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ MetaRenamer() : ModulePass(ID) {
+ initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) override {
+      // Seed our PRNG with a simple additive sum of the ModuleID. We just want
+      // to avoid always producing the same function names while remaining
+      // deterministic.
+ unsigned int randSeed = 0;
+ for (auto C : M.getModuleIdentifier())
+ randSeed += C;
+
+ Renamer renamer(randSeed);
+
+ // Rename all aliases
+ for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) {
+ StringRef Name = AI->getName();
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ continue;
+
+ AI->setName("alias");
+ }
+
+ // Rename all global variables
+ for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {
+ StringRef Name = GI->getName();
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ continue;
+
+ GI->setName("global");
+ }
+
+ // Rename all struct types
+ TypeFinder StructTypes;
+ StructTypes.run(M, true);
+ for (StructType *STy : StructTypes) {
+ if (STy->isLiteral() || STy->getName().empty()) continue;
+
+ SmallString<128> NameStorage;
+ STy->setName((Twine("struct.") +
+ renamer.newName()).toStringRef(NameStorage));
+ }
+
+ // Rename all functions
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ for (auto &F : M) {
+ StringRef Name = F.getName();
+ LibFunc Tmp;
+ // Leave library functions alone because their presence or absence could
+ // affect the behavior of other passes.
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+ TLI.getLibFunc(F, Tmp))
+ continue;
+
+ F.setName(renamer.newName());
+ runOnFunction(F);
+ }
+ return true;
+ }
+
+ bool runOnFunction(Function &F) {
+ for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI)
+ if (!AI->getType()->isVoidTy())
+ AI->setName("arg");
+
+ for (auto &BB : F) {
+ BB.setName("bb");
+
+ for (auto &I : BB)
+ if (!I.getType()->isVoidTy())
+ I.setName("tmp");
+ }
+ return true;
+ }
+ };
+}
+
+char MetaRenamer::ID = 0;
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+ return new MetaRenamer();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
new file mode 100644
index 000000000000..2ef3d6336ae2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -0,0 +1,271 @@
+//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform manipulations on Modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
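+// Append a new (priority, function[, data]) entry to the appending global
+// array named Array (llvm.global_ctors or llvm.global_dtors), recreating the
+// array with the extra element.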
+static void appendToGlobalArray(const char *Array, Module &M, Function *F,
+ int Priority, Constant *Data) {
+ IRBuilder<> IRB(M.getContext());
+ FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
+
+ // Get the current set of static global constructors and add the new ctor
+ // to the list.
+ SmallVector<Constant *, 16> CurrentCtors;
+ StructType *EltTy;
+ if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
+ ArrayType *ATy = cast<ArrayType>(GVCtor->getValueType());
+ StructType *OldEltTy = cast<StructType>(ATy->getElementType());
+ // Upgrade a 2-field global array type to the new 3-field format if needed.
+ if (Data && OldEltTy->getNumElements() < 3)
+ EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
+ IRB.getInt8PtrTy());
+ else
+ EltTy = OldEltTy;
+ if (Constant *Init = GVCtor->getInitializer()) {
+ unsigned n = Init->getNumOperands();
+ CurrentCtors.reserve(n + 1);
+ for (unsigned i = 0; i != n; ++i) {
+ auto Ctor = cast<Constant>(Init->getOperand(i));
+ if (EltTy != OldEltTy)
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
+ CurrentCtors.push_back(Ctor);
+ }
+ }
+ GVCtor->eraseFromParent();
+ } else {
+ // Use the new three-field struct if there isn't one already.
+ EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
+ IRB.getInt8PtrTy());
+ }
+
+ // Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
+ Constant *CSVals[3];
+ CSVals[0] = IRB.getInt32(Priority);
+ CSVals[1] = F;
+ // FIXME: Drop support for the two element form in LLVM 4.0.
+ if (EltTy->getNumElements() >= 3)
+ CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
+ : Constant::getNullValue(IRB.getInt8PtrTy());
+ Constant *RuntimeCtorInit =
+ ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
+
+ CurrentCtors.push_back(RuntimeCtorInit);
+
+ // Create a new initializer.
+ ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size());
+ Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
+
+ // Create the new global variable and replace all uses of
+ // the old global variable with the new one.
+ (void)new GlobalVariable(M, NewInit->getType(), false,
+ GlobalValue::AppendingLinkage, NewInit, Array);
+}
+
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
+}
+
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
+}
+
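+// Add Values to the appending global array named Name (llvm.used or
+// llvm.compiler.used), skipping values already present in the existing
+// initializer.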
+static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
+ GlobalVariable *GV = M.getGlobalVariable(Name);
+ SmallPtrSet<Constant *, 16> InitAsSet;
+ SmallVector<Constant *, 16> Init;
+ if (GV) {
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ Constant *C = cast_or_null<Constant>(Op);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+ GV->eraseFromParent();
+ }
+
+ Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
+ for (auto *V : Values) {
+ Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+
+ if (Init.empty())
+ return;
+
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
+ GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+}
+
+void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.used", Values);
+}
+
+void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.compiler.used", Values);
+}
+
+Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
+ if (isa<Function>(FuncOrBitcast))
+ return cast<Function>(FuncOrBitcast);
+ FuncOrBitcast->print(errs());
+ errs() << '\n';
+ std::string Err;
+ raw_string_ostream Stream(Err);
+ Stream << "Sanitizer interface function redefined: " << *FuncOrBitcast;
+  report_fatal_error(Stream.str()); // str() flushes the stream's buffer first.
+}
+
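+// Declare the sanitizer init function InitName, which returns void, takes
+// InitArgTypes, and has external linkage.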
+Function *llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes) {
+ assert(!InitName.empty() && "Expected init function name");
+ Function *F = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ InitName,
+ FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+ AttributeList()));
+ F->setLinkage(Function::ExternalLinkage);
+ return F;
+}
+
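+// Create a constructor function CtorName that calls InitName(InitArgs) and,
+// if VersionCheckName is non-empty, the corresponding version check function.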
+std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
+ Module &M, StringRef CtorName, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
+ StringRef VersionCheckName) {
+ assert(!InitName.empty() && "Expected init function name");
+ assert(InitArgs.size() == InitArgTypes.size() &&
+ "Sanitizer's init function expects different number of arguments");
+ Function *InitFunction =
+ declareSanitizerInitFunction(M, InitName, InitArgTypes);
+ Function *Ctor = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, CtorName, &M);
+ BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
+ IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
+ IRB.CreateCall(InitFunction, InitArgs);
+ if (!VersionCheckName.empty()) {
+ Function *VersionCheckFunction =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
+ AttributeList()));
+ IRB.CreateCall(VersionCheckFunction, {});
+ }
+ return std::make_pair(Ctor, InitFunction);
+}
+
+void llvm::filterDeadComdatFunctions(
+ Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ // Build a map from the comdat to the number of entries in that comdat we
+ // think are dead. If this fully covers the comdat group, then the entire
+ // group is dead. If we find another entry in the comdat group though, we'll
+ // have to preserve the whole group.
+ SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ for (Function *F : DeadComdatFunctions) {
+ Comdat *C = F->getComdat();
+ assert(C && "Expected all input GVs to be in a comdat!");
+ ComdatEntriesCovered[C] += 1;
+ }
+
+ auto CheckComdat = [&](Comdat &C) {
+ auto CI = ComdatEntriesCovered.find(&C);
+ if (CI == ComdatEntriesCovered.end())
+ return;
+
+ // If this could have been covered by a dead entry, just subtract one to
+ // account for it.
+ if (CI->second > 0) {
+ CI->second -= 1;
+ return;
+ }
+
+ // If we've already accounted for all the entries that were dead, the
+ // entire comdat is alive so remove it from the map.
+ ComdatEntriesCovered.erase(CI);
+ };
+
+ auto CheckAllComdats = [&] {
+ for (Function &F : M.functions())
+ if (Comdat *C = F.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ };
+ CheckAllComdats();
+
+ if (ComdatEntriesCovered.empty()) {
+ DeadComdatFunctions.clear();
+ return;
+ }
+
+  // Remove the functions whose comdat group still has live entries and so
+  // must be preserved.
+ erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
+ return ComdatEntriesCovered.find(GV->getComdat()) ==
+ ComdatEntriesCovered.end();
+ });
+}
+
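+// Build a module identifier by MD5-hashing the names of all external-linkage
+// definitions; returns "$<hash>", or an empty string if there are none.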
+std::string llvm::getUniqueModuleId(Module *M) {
+ MD5 Md5;
+ bool ExportsSymbols = false;
+ auto AddGlobal = [&](GlobalValue &GV) {
+ if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
+ !GV.hasExternalLinkage())
+ return;
+ ExportsSymbols = true;
+ Md5.update(GV.getName());
+ Md5.update(ArrayRef<uint8_t>{0});
+ };
+
+ for (auto &F : *M)
+ AddGlobal(F);
+ for (auto &GV : M->globals())
+ AddGlobal(GV);
+ for (auto &GA : M->aliases())
+ AddGlobal(GA);
+ for (auto &IF : M->ifuncs())
+ AddGlobal(IF);
+
+ if (!ExportsSymbols)
+ return "";
+
+ MD5::MD5Result R;
+ Md5.final(R);
+
+ SmallString<32> Str;
+ MD5::stringifyResult(R, Str);
+ return ("$" + Str).str();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
new file mode 100644
index 000000000000..34dc1cccdd5b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -0,0 +1,121 @@
+//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements naming anonymous globals to make sure they can be
+// referred to by ThinLTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+namespace {
+// Compute a "unique" hash for the module based on the names of its public
+// globals.
+class ModuleHasher {
+ Module &TheModule;
+ std::string TheHash;
+
+public:
+ ModuleHasher(Module &M) : TheModule(M) {}
+
+ /// Return the lazily computed hash.
+ std::string &get() {
+ if (!TheHash.empty())
+ // Cache hit :)
+ return TheHash;
+
+ MD5 Hasher;
+ for (auto &F : TheModule) {
+ if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
+ continue;
+ auto Name = F.getName();
+ Hasher.update(Name);
+ }
+ for (auto &GV : TheModule.globals()) {
+ if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
+ continue;
+ auto Name = GV.getName();
+ Hasher.update(Name);
+ }
+
+ // Now return the result.
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ SmallString<32> Result;
+ MD5::stringifyResult(Hash, Result);
+ TheHash = Result.str();
+ return TheHash;
+ }
+};
+} // end anonymous namespace
+
+// Rename all the anon globals in the module
+bool llvm::nameUnamedGlobals(Module &M) {
+ bool Changed = false;
+ ModuleHasher ModuleHash(M);
+ int count = 0;
+ auto RenameIfNeed = [&](GlobalValue &GV) {
+ if (GV.hasName())
+ return;
+ GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
+ Changed = true;
+ };
+ for (auto &GO : M.global_objects())
+ RenameIfNeed(GO);
+ for (auto &GA : M.aliases())
+ RenameIfNeed(GA);
+
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that provides a name to every anonymous global.
+class NameAnonGlobalLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Name Anon Globals"; }
+
+ explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
+};
+char NameAnonGlobalLegacyPass::ID = 0;
+
+} // anonymous namespace
+
+PreservedAnalyses NameAnonGlobalPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!nameUnamedGlobals(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+
+namespace llvm {
+ModulePass *createNameAnonGlobalPass() {
+ return new NameAnonGlobalLegacyPass();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
new file mode 100644
index 000000000000..dc780542ce68
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
@@ -0,0 +1,32 @@
+//===-- OrderedInstructions.cpp - Instruction dominance function ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility to check the dominance relation of two
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
+using namespace llvm;
+
+/// Given two instructions, use OrderedBasicBlock to check the dominance
+/// relation if they are in the same basic block; otherwise, use the dominator
+/// tree.
+bool OrderedInstructions::dominates(const Instruction *InstA,
+ const Instruction *InstB) const {
+ const BasicBlock *IBB = InstA->getParent();
+ // Use ordered basic block to do dominance check in case the 2 instructions
+ // are in the same basic block.
+ if (IBB == InstB->getParent()) {
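+    // Lazily create and cache an OrderedBasicBlock for this block.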
+ auto OBB = OBBMap.find(IBB);
+ if (OBB == OBBMap.end())
+ OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
+ return OBB->second->dominates(InstA, InstB);
+ }
+ return DT->dominates(InstA->getParent(), InstB->getParent());
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
new file mode 100644
index 000000000000..d4cdaede6b86
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -0,0 +1,793 @@
+//===-- PredicateInfo.cpp - PredicateInfo Builder -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::PredicateInfoClasses;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+ "verify-predicateinfo", cl::init(false), cl::Hidden,
+ cl::desc("Verify PredicateInfo in legacy printer pass."));
+namespace {
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+ "Controls which variables are renamed with predicateinfo")
+// Given a predicate info that is a type of branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Only branches and switches should have PHIOnly defs that "
+ "require branch blocks.");
+ return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get a terminator from.");
+ return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// edge this predicate info represents
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get an edge from.");
+ const auto *PEdge = cast<PredicateWithEdge>(PB);
+ return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+namespace PredicateInfoClasses {
+enum LocalNum {
+ // Operations that must appear first in the block.
+ LN_First,
+ // Operations that are somewhere in the middle of the block, and are sorted on
+ // demand.
+ LN_Middle,
+ // Operations that must appear last in a block, like successor phi node uses.
+ LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ unsigned int LocalNum = LN_Middle;
+ // Only one of Def or Use will be set.
+ Value *Def = nullptr;
+ Use *U = nullptr;
+ // Neither PInfo nor EdgeOnly participate in the ordering
+ PredicateBase *PInfo = nullptr;
+ bool EdgeOnly = false;
+};
+
+// Perform a strict weak ordering on instructions and arguments.
+static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
+ const Value *B) {
+ auto *ArgA = dyn_cast_or_null<Argument>(A);
+ auto *ArgB = dyn_cast_or_null<Argument>(B);
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+ return OI.dominates(cast<Instruction>(A), cast<Instruction>(B));
+}
+
+// This compares ValueDFS structures, creating OrderedBasicBlocks where
+// necessary to compare uses/defs in the same block. Doing so allows us to walk
+// the minimum number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+ OrderedInstructions &OI;
+ ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {}
+
+ bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+ if (&A == &B)
+ return false;
+    // The only case where we can't directly compare them is when they are in
+    // the same block and both have LocalNum == LN_Middle. In that case, we
+    // have to use localComesBefore to see what the real ordering is, because
+    // they are in the same basic block.
+
+ bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut);
+
+ // We want to put the def that will get used for a given set of phi uses,
+ // before those phi uses.
+ // So we sort by edge, then by def.
+    // Note that only phi node uses and defs can come last.
+ if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+ return comparePHIRelated(A, B);
+
+ if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+ return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) <
+ std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U);
+ return localComesBefore(A, B);
+ }
+
+ // For a phi use, or a non-materialized def, return the edge it represents.
+ const std::pair<BasicBlock *, BasicBlock *>
+ getBlockEdge(const ValueDFS &VD) const {
+ if (!VD.Def && VD.U) {
+ auto *PHI = cast<PHINode>(VD.U->getUser());
+ return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+ }
+ // This is really a non-materialized def.
+ return ::getBlockEdge(VD.PInfo);
+ }
+
+ // For two phi related values, return the ordering.
+ bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+ auto &ABlockEdge = getBlockEdge(A);
+ auto &BBlockEdge = getBlockEdge(B);
+ // Now sort by block edge and then defs before uses.
+ return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U);
+ }
+
+ // Get the definition of an instruction that occurs in the middle of a block.
+ Value *getMiddleDef(const ValueDFS &VD) const {
+ if (VD.Def)
+ return VD.Def;
+    // It's possible for the defs and uses to be null. For branches, the local
+    // numbering will say the placed predicateinfos should go first (i.e.
+    // LN_First), so we won't be in this function. For assumes, we will end up
+    // here, because we need to order the def we will place relative to the
+    // assume. So for the purpose of ordering, we pretend the def is the assume
+    // because that is where we will insert the info.
+ if (!VD.U) {
+ assert(VD.PInfo &&
+ "No def, no use, and no predicateinfo should not occur");
+ assert(isa<PredicateAssume>(VD.PInfo) &&
+ "Middle of block should only occur for assumes");
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst;
+ }
+ return nullptr;
+ }
+
+ // Return either the Def, if it's not null, or the user of the Use, if the def
+ // is null.
+ const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+ if (Def)
+ return cast<Instruction>(Def);
+ return cast<Instruction>(U->getUser());
+ }
+
+ // This performs the necessary local basic block ordering checks to tell
+ // whether A comes before B, where both are in the same basic block.
+ bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+ auto *ADef = getMiddleDef(A);
+ auto *BDef = getMiddleDef(B);
+
+ // See if we have real values or uses. If we have real values, we are
+ // guaranteed they are instructions or arguments. No matter what, we are
+ // guaranteed they are in the same block if they are instructions.
+ auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+ auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+ if (ArgA || ArgB)
+ return valueComesBefore(OI, ArgA, ArgB);
+
+ auto *AInst = getDefOrUser(ADef, A.U);
+ auto *BInst = getDefOrUser(BDef, B.U);
+ return valueComesBefore(OI, AInst, BInst);
+ }
+};
+
+} // namespace PredicateInfoClasses
+
+bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
+ if (Stack.empty())
+ return false;
+ // If it's a phi only use, make sure it's for this phi node edge, and that the
+ // use is in a phi node. If it's anything else, and the top of the stack is
+ // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
+ // the defs they must go with so that we can know it's time to pop the stack
+ // when we hit the end of the phi uses for a given def.
+ if (Stack.back().EdgeOnly) {
+ if (!VDUse.U)
+ return false;
+ auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+ if (!PHI)
+ return false;
+ // Check edge
+ BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+ if (EdgePred != getBranchBlock(Stack.back().PInfo))
+ return false;
+
+ // Use dominates, which knows how to handle edge dominance.
+ return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+ }
+
+ return (VDUse.DFSIn >= Stack.back().DFSIn &&
+ VDUse.DFSOut <= Stack.back().DFSOut);
+}
+
+void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
+ while (!Stack.empty() && !stackIsInScope(Stack, VD))
+ Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfo::convertUsesToDFSOrdered(
+ Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto &U : Op->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *PN = dyn_cast<PHINode>(I)) {
+ IBlock = PN->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = LN_Last;
+ } else {
+ // If it's not a phi node use, it is somewhere in the middle of the
+ // block.
+ IBlock = I->getParent();
+ VD.LocalNum = LN_Middle;
+ }
+ DomTreeNode *DomNode = DT.getNode(IBlock);
+ // It's possible our use is in an unreachable block. Skip it if so.
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.U = &U;
+ DFSOrderedSet.push_back(VD);
+ }
+ }
+}
+
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+ auto *Op0 = Comparison->getOperand(0);
+ auto *Op1 = Comparison->getOperand(1);
+ if (Op0 == Op1)
+ return;
+ CmpOperands.push_back(Comparison);
+ // Only want real values, not constants. Additionally, operands with one use
+ // are only being used in the comparison, which means they will not be useful
+ // for us to consider for predicateinfo.
+ //
+ if ((isa<Instruction>(Op0) || isa<Argument>(Op0)) && !Op0->hasOneUse())
+ CmpOperands.push_back(Op0);
+ if ((isa<Instruction>(Op1) || isa<Argument>(Op1)) && !Op1->hasOneUse())
+ CmpOperands.push_back(Op1);
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB) {
+ OpsToRename.insert(Op);
+ auto &OperandInfo = getOrCreateValueInfo(Op);
+ AllInfos.push_back(PB);
+ OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ // See if we have a comparison we support
+ SmallVector<Value *, 8> CmpOperands;
+ SmallVector<Value *, 2> ConditionsToProcess;
+ CmpInst::Predicate Pred;
+ Value *Operand = II->getOperand(0);
+ if (m_c_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))
+ .match(II->getOperand(0))) {
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(0));
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(1));
+ ConditionsToProcess.push_back(Operand);
+ } else if (isa<CmpInst>(Operand)) {
+
+ ConditionsToProcess.push_back(Operand);
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands) {
+ auto *PA = new PredicateAssume(Op, II, Cmp);
+ addInfoFor(OpsToRename, Op, PA);
+ }
+ CmpOperands.clear();
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // Otherwise, it should be an AND.
+ assert(BinOp->getOpcode() == Instruction::And &&
+ "Should have been an AND");
+ auto *PA = new PredicateAssume(BinOp, II, BinOp);
+ addInfoFor(OpsToRename, BinOp, PA);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ BasicBlock *FirstBB = BI->getSuccessor(0);
+ BasicBlock *SecondBB = BI->getSuccessor(1);
+ SmallVector<BasicBlock *, 2> SuccsToProcess;
+ SuccsToProcess.push_back(FirstBB);
+ SuccsToProcess.push_back(SecondBB);
+ SmallVector<Value *, 2> ConditionsToProcess;
+
+ auto InsertHelper = [&](Value *Op, bool isAnd, bool isOr, Value *Cond) {
+ for (auto *Succ : SuccsToProcess) {
+ // Don't try to insert on a self-edge. This is mainly because we will
+      // eliminate it during renaming anyway.
+ if (Succ == BranchBB)
+ continue;
+ bool TakenEdge = (Succ == FirstBB);
+ // For and, only insert on the true edge
+ // For or, only insert on the false edge
+ if ((isAnd && !TakenEdge) || (isOr && TakenEdge))
+ continue;
+ PredicateBase *PB =
+ new PredicateBranch(Op, BranchBB, Succ, Cond, TakenEdge);
+ addInfoFor(OpsToRename, Op, PB);
+ if (!Succ->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, Succ});
+ }
+ };
+
+ // Match combinations of conditions.
+ CmpInst::Predicate Pred;
+ bool isAnd = false;
+ bool isOr = false;
+ SmallVector<Value *, 8> CmpOperands;
+ if (match(BI->getCondition(), m_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))) ||
+ match(BI->getCondition(), m_Or(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value())))) {
+ auto *BinOp = cast<BinaryOperator>(BI->getCondition());
+ if (BinOp->getOpcode() == Instruction::And)
+ isAnd = true;
+ else if (BinOp->getOpcode() == Instruction::Or)
+ isOr = true;
+ ConditionsToProcess.push_back(BinOp->getOperand(0));
+ ConditionsToProcess.push_back(BinOp->getOperand(1));
+ ConditionsToProcess.push_back(BI->getCondition());
+ } else if (isa<CmpInst>(BI->getCondition())) {
+ ConditionsToProcess.push_back(BI->getCondition());
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands)
+ InsertHelper(Op, isAnd, isOr, Cmp);
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // This must be an AND or an OR.
+ assert((BinOp->getOpcode() == Instruction::And ||
+ BinOp->getOpcode() == Instruction::Or) &&
+ "Should have been an AND or an OR");
+ // The actual value of the binop is not subject to the same restrictions
+ // as the comparison. It's either true or false on the true/false branch.
+ InsertHelper(BinOp, false, false, BinOp);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ CmpOperands.clear();
+ }
+}
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ Value *Op = SI->getCondition();
+ if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+ return;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ }
+
+ // Now propagate info for each case value
+ for (auto C : SI->cases()) {
+ BasicBlock *TargetBlock = C.getCaseSuccessor();
+ if (SwitchEdges.lookup(TargetBlock) == 1) {
+ PredicateSwitch *PS = new PredicateSwitch(
+ Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+ addInfoFor(OpsToRename, Op, PS);
+ if (!TargetBlock->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, TargetBlock});
+ }
+ }
+}
+
+// Build predicate info for our function
+void PredicateInfo::buildPredicateInfo() {
+ DT.updateDFSNumbers();
+ // Collect operands to rename from all conditional branch terminators, as well
+ // as assume statements.
+ SmallPtrSet<Value *, 8> OpsToRename;
+ for (auto DTN : depth_first(DT.getRootNode())) {
+ BasicBlock *BranchBB = DTN->getBlock();
+ if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+ if (!BI->isConditional())
+ continue;
+ // Can't insert conditional information if they all go to the same place.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+ processBranch(BI, BranchBB, OpsToRename);
+ } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+ processSwitch(SI, BranchBB, OpsToRename);
+ }
+ }
+ for (auto &Assume : AC.assumptions()) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+ processAssume(II, II->getParent(), OpsToRename);
+ }
+ // Now rename all our operations.
+ renameUses(OpsToRename);
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR. Return the last operation's value.
+Value *PredicateInfo::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
+ // Find the first thing we have to materialize
+ auto RevIter = RenameStack.rbegin();
+ for (; RevIter != RenameStack.rend(); ++RevIter)
+ if (RevIter->Def)
+ break;
+
+ size_t Start = RevIter - RenameStack.rbegin();
+  // The maximum number of things we should be trying to materialize at once
+  // right now is 4, depending on whether we had an assume, a branch, and
+  // whether both used an 'and' of conditions.
+ for (auto RenameIter = RenameStack.end() - Start;
+ RenameIter != RenameStack.end(); ++RenameIter) {
+ auto *Op =
+ RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+ ValueDFS &Result = *RenameIter;
+ auto *ValInfo = Result.PInfo;
+ // For edge predicates, we can just place the operand in the block before
+ // the terminator. For assume, we have to place it right before the assume
+ // to ensure we dominate all of our uses. Always insert right before the
+ // relevant instruction (terminator, assume), so that we insert in proper
+ // order in the case of multiple predicateinfo in the same block.
+ if (isa<PredicateWithEdge>(ValInfo)) {
+ IRBuilder<> B(getBranchTerminator(ValInfo));
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC =
+ B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ } else {
+ auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+ assert(PAssume &&
+ "Should not have gotten here without it being an assume");
+ IRBuilder<> B(PAssume->AssumeInst);
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC = B.CreateCall(IF, Op);
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ }
+ }
+ return RenameStack.back().Def;
+}
+
+// Instead of the standard SSA renaming algorithm, which is O(Number of
+// instructions), and walks the entire dominator tree, we walk only the defs +
+// uses. The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses. This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the dfs scope for that def, and replacing any uses
+// with top of stack if it exists. In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate. Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// promotememtoreg and memoryssa.
+void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
+ // Sort OpsToRename since we are going to iterate it.
+ SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end());
+ auto Comparator = [&](const Value *A, const Value *B) {
+ return valueComesBefore(OI, A, B);
+ };
+ std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
+ ValueDFS_Compare Compare(OI);
+ // Compute liveness, and rename in O(uses) per Op.
+ for (auto *Op : OpsToRename) {
+ unsigned Counter = 0;
+ SmallVector<ValueDFS, 16> OrderedUses;
+ const auto &ValueInfo = getValueInfo(Op);
+ // Insert the possible copies into the def/use list.
+ // They will become real copies if we find a real use for them, and never
+ // created otherwise.
+ for (auto &PossibleCopy : ValueInfo.Infos) {
+ ValueDFS VD;
+      // Determine where we are going to place the copy by the copy type.
+      // The predicate info for branches always comes first; it will get
+      // materialized in the split block at the top of the block.
+      // The predicate info for assumes will be somewhere in the middle;
+      // it will get materialized in front of the assume.
+ if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+ VD.LocalNum = LN_Middle;
+ DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+ // If we can only do phi uses, we treat it like it's in the branch
+ // block, and handle it specially. We know that it goes last, and only
+        // dominates phi uses.
+ auto BlockEdge = getBlockEdge(PossibleCopy);
+ if (EdgeUsesOnly.count(BlockEdge)) {
+ VD.LocalNum = LN_Last;
+ auto *DomNode = DT.getNode(BlockEdge.first);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ VD.EdgeOnly = true;
+ OrderedUses.push_back(VD);
+ }
+ } else {
+ // Otherwise, we are in the split block (even though we perform
+ // insertion in the branch block).
+ // Insert a possible copy at the split block and before the branch.
+ VD.LocalNum = LN_First;
+ auto *DomNode = DT.getNode(BlockEdge.second);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ }
+ }
+ }
+ }
+
+ convertUsesToDFSOrdered(Op, OrderedUses);
+ std::sort(OrderedUses.begin(), OrderedUses.end(), Compare);
+ SmallVector<ValueDFS, 8> RenameStack;
+ // For each use, sorted into dfs order, push values and replaces uses with
+ // top of stack, which will represent the reaching def.
+ for (auto &VD : OrderedUses) {
+ // We currently do not materialize copy over copy, but we should decide if
+ // we want to.
+ bool PossibleCopy = VD.PInfo != nullptr;
+ if (RenameStack.empty()) {
+ DEBUG(dbgs() << "Rename Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
+
+ bool ShouldPush = (VD.Def || PossibleCopy);
+ bool OutOfScope = !stackIsInScope(RenameStack, VD);
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ popStackUntilDFSScope(RenameStack, VD);
+ if (ShouldPush) {
+ RenameStack.push_back(VD);
+ }
+ }
+ // If we get to this point, and the stack is empty we must have a use
+ // with no renaming needed, just skip it.
+ if (RenameStack.empty())
+ continue;
+      // Skip defs and possible copies; we only want to rename the uses.
+ if (VD.Def || PossibleCopy)
+ continue;
+ if (!DebugCounter::shouldExecute(RenameCounter)) {
+ DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ continue;
+ }
+ ValueDFS &Result = RenameStack.back();
+
+ // If the possible copy dominates something, materialize our stack up to
+ // this point. This ensures every comparison that affects our operation
+ // ends up with predicateinfo.
+ if (!Result.Def)
+ Result.Def = materializeStack(Counter, RenameStack, Op);
+
+ DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+ assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+ "Predicateinfo def should have dominated this use");
+ VD.U->set(Result.Def);
+ }
+ }
+}
+
+PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
+ auto OIN = ValueInfoNums.find(Operand);
+ if (OIN == ValueInfoNums.end()) {
+ // This will grow it
+ ValueInfos.resize(ValueInfos.size() + 1);
+ // This will use the new size and give us a 0-based number for the info.
+ auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+ assert(InsertResult.second && "Value info number already existed?");
+ return ValueInfos[InsertResult.first->second];
+ }
+ return ValueInfos[OIN->second];
+}
+
+const PredicateInfo::ValueInfo &
+PredicateInfo::getValueInfo(Value *Operand) const {
+ auto OINI = ValueInfoNums.lookup(Operand);
+ assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+ assert(OINI < ValueInfos.size() &&
+ "Value Info Number greater than size of Value Info Table");
+ return ValueInfos[OINI];
+}
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F), DT(DT), AC(AC), OI(&DT) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ buildPredicateInfo();
+}
+
+PredicateInfo::~PredicateInfo() {}
+
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+ : FunctionPass(ID) {
+ initializePredicateInfoPrinterLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(dbgs());
+ if (VerifyPredicateInfo)
+ PredInfo->verifyPredicateInfo();
+ return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ OS << "PredicateInfo for function: " << F.getName() << "\n";
+ make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+
+ return PreservedAnalyses::all();
+}
+
+/// \brief An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class PredicateInfo;
+ const PredicateInfo *PredInfo;
+
+public:
+ PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) {}
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+ if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+ OS << "; Has predicate info\n";
+ if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+ OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+ << " Comparison:" << *PB->Condition << " Edge: [";
+ PB->From->printAsOperand(OS);
+ OS << ",";
+ PB->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+ OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+ << " Switch:" << *PS->Switch << " Edge: [";
+ PS->From->printAsOperand(OS);
+ OS << ",";
+ PS->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+ OS << "; assume predicate info {"
+ << " Comparison:" << *PA->Condition << " }\n";
+ }
+ }
+ }
+};
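+
+// For illustration only (the IR names and values below are hypothetical), the
+// annotations emitted by the writer above look roughly like:
+//
+//   ; Has predicate info
+//   ; branch predicate info { TrueEdge: 1 Comparison: %cmp = icmp ... Edge: [...] }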
+
+void PredicateInfo::print(raw_ostream &OS) const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+ return PreservedAnalyses::all();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
new file mode 100644
index 000000000000..cdba982e6641
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -0,0 +1,1000 @@
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references. It promotes
+// alloca instructions which only have loads and stores as uses. An alloca is
+// transformed by using iterated dominator frontiers to place PHI nodes, then
+// traversing the function in depth-first order to rewrite loads and stores as
+// appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
+
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+ // FIXME: If the memory unit is of pointer or integer type, we can permit
+ // assignments to subsections of the memory unit.
+ unsigned AS = AI->getType()->getAddressSpace();
+
+ // Only allow direct and non-volatile loads and stores...
+ for (const User *U : AI->users()) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Note that atomic loads can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (LI->isVolatile())
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == AI)
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ // Note that atomic stores can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (SI->isVolatile())
+ return false;
+ } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+ return false;
+ if (!onlyUsedByLifetimeMarkers(BCI))
+ return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+ return false;
+ if (!GEPI->hasAllZeroIndices())
+ return false;
+ if (!onlyUsedByLifetimeMarkers(GEPI))
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
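+
+// For illustration only (hypothetical IR): an alloca whose address escapes,
+// e.g. one passed to a call such as
+//
+//   %p = alloca i32
+//   call void @takes_ptr(i32* %p)
+//
+// falls into the final 'else' above and is rejected, while an alloca used
+// only by direct, non-volatile loads and stores (plus lifetime markers) is
+// reported as promotable.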
+
+namespace {
+
+struct AllocaInfo {
+ SmallVector<BasicBlock *, 32> DefiningBlocks;
+ SmallVector<BasicBlock *, 32> UsingBlocks;
+
+ StoreInst *OnlyStore;
+ BasicBlock *OnlyBlock;
+ bool OnlyUsedInOneBlock;
+
+ Value *AllocaPointerVal;
+ DbgDeclareInst *DbgDeclare;
+
+ void clear() {
+ DefiningBlocks.clear();
+ UsingBlocks.clear();
+ OnlyStore = nullptr;
+ OnlyBlock = nullptr;
+ OnlyUsedInOneBlock = true;
+ AllocaPointerVal = nullptr;
+ DbgDeclare = nullptr;
+ }
+
+ /// Scan the uses of the specified alloca, filling in the AllocaInfo used
+ /// by the rest of the pass to reason about the uses of this alloca.
+ void AnalyzeAlloca(AllocaInst *AI) {
+ clear();
+
+ // As we scan the uses of the alloca instruction, keep track of stores,
+ // and decide whether all of the loads and stores to the alloca are within
+ // the same basic block.
+ for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Remember the basic blocks which define new values for the alloca
+ DefiningBlocks.push_back(SI->getParent());
+ AllocaPointerVal = SI->getOperand(0);
+ OnlyStore = SI;
+ } else {
+ LoadInst *LI = cast<LoadInst>(User);
+ // Otherwise it must be a load instruction; keep track of variable
+ // reads.
+ UsingBlocks.push_back(LI->getParent());
+ AllocaPointerVal = LI;
+ }
+
+ if (OnlyUsedInOneBlock) {
+ if (!OnlyBlock)
+ OnlyBlock = User->getParent();
+ else if (OnlyBlock != User->getParent())
+ OnlyUsedInOneBlock = false;
+ }
+ }
+
+ DbgDeclare = FindAllocaDbgDeclare(AI);
+ }
+};
+
+// Data package used by RenamePass()
+class RenamePassData {
+public:
+ typedef std::vector<Value *> ValVector;
+
+ RenamePassData() : BB(nullptr), Pred(nullptr), Values() {}
+ RenamePassData(BasicBlock *B, BasicBlock *P, const ValVector &V)
+ : BB(B), Pred(P), Values(V) {}
+ BasicBlock *BB;
+ BasicBlock *Pred;
+ ValVector Values;
+
+ void swap(RenamePassData &RHS) {
+ std::swap(BB, RHS.BB);
+ std::swap(Pred, RHS.Pred);
+ Values.swap(RHS.Values);
+ }
+};
+
+/// \brief This assigns and keeps a per-bb relative ordering of load/store
+/// instructions in the block that directly load or store an alloca.
+///
+/// This functionality is important because it avoids scanning large basic
+/// blocks multiple times when promoting many allocas in the same block.
+class LargeBlockInfo {
+ /// \brief For each instruction that we track, keep the index of the
+ /// instruction.
+ ///
+ /// The index starts out as the number of the instruction from the start of
+ /// the block.
+ DenseMap<const Instruction *, unsigned> InstNumbers;
+
+public:
+
+ /// This code only looks at accesses to allocas.
+ static bool isInterestingInstruction(const Instruction *I) {
+ return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+ (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+ }
+
+ /// Get or calculate the index of the specified instruction.
+ unsigned getInstructionIndex(const Instruction *I) {
+ assert(isInterestingInstruction(I) &&
+ "Not a load/store to/from an alloca?");
+
+ // If we already have this instruction number, return it.
+ DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+ if (It != InstNumbers.end())
+ return It->second;
+
+ // Scan the whole block to get the instruction. This accumulates
+ // information for every interesting instruction in the block, in order to
+ // avoid gratuitous rescans.
+ const BasicBlock *BB = I->getParent();
+ unsigned InstNo = 0;
+ for (const Instruction &BBI : *BB)
+ if (isInterestingInstruction(&BBI))
+ InstNumbers[&BBI] = InstNo++;
+ It = InstNumbers.find(I);
+
+ assert(It != InstNumbers.end() && "Didn't insert instruction?");
+ return It->second;
+ }
+
+ void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+
+ void clear() { InstNumbers.clear(); }
+};
+
+struct PromoteMem2Reg {
+ /// The alloca instructions being promoted.
+ std::vector<AllocaInst *> Allocas;
+ DominatorTree &DT;
+ DIBuilder DIB;
+ /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
+ AssumptionCache *AC;
+
+ const SimplifyQuery SQ;
+ /// Reverse mapping of Allocas.
+ DenseMap<AllocaInst *, unsigned> AllocaLookup;
+
+ /// \brief The PhiNodes we're adding.
+ ///
+ /// That map is used to simplify some Phi nodes as we iterate over it, so
+ /// it should have deterministic iterators. We could use a MapVector, but
+ /// since we already maintain a map from BasicBlock* to a stable numbering
+ /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+ DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
+
+ /// For each PHI node, keep track of which entry in Allocas it corresponds
+ /// to.
+ DenseMap<PHINode *, unsigned> PhiToAllocaMap;
+
+ /// If we are updating an AliasSetTracker, then for each alloca that is of
+ /// pointer type, we keep track of what to copyValue to the inserted PHI
+ /// nodes here.
+ std::vector<Value *> PointerAllocaValues;
+
+ /// For each alloca, we keep track of the dbg.declare intrinsic that
+ /// describes it, if any, so that we can convert it to a dbg.value
+ /// intrinsic if the alloca gets promoted.
+ SmallVector<DbgDeclareInst *, 8> AllocaDbgDeclares;
+
+ /// The set of basic blocks the renamer has already visited.
+ ///
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+ /// Contains a stable numbering of basic blocks to avoid non-deterministic
+ /// behavior.
+ DenseMap<BasicBlock *, unsigned> BBNumbers;
+
+ /// Lazily compute the number of predecessors a block has.
+ DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+
+public:
+ PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC)
+ : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
+ DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
+ AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
+ nullptr, &DT, AC) {}
+
+ void run();
+
+private:
+ void RemoveFromAllocasList(unsigned &AllocaIdx) {
+ Allocas[AllocaIdx] = Allocas.back();
+ Allocas.pop_back();
+ --AllocaIdx;
+ }
+
+ unsigned getNumPreds(const BasicBlock *BB) {
+ unsigned &NP = BBNumPreds[BB];
+ if (NP == 0)
+ NP = std::distance(pred_begin(BB), pred_end(BB)) + 1;
+ return NP - 1;
+ }
+
+ void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
+ void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncVals,
+ std::vector<RenamePassData> &Worklist);
+ bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+};
+
+} // end of anonymous namespace
+
+/// Given a LoadInst LI, this adds assume(LI != null) after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+ Function *AssumeIntrinsic =
+ Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+ ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+ Constant::getNullValue(LI->getType()));
+ LoadNotNull->insertAfter(LI);
+ CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+ CI->insertAfter(LoadNotNull);
+ AC->registerAssumption(CI);
+}
+
+static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+ // Knowing that this alloca is promotable, we know that it's safe to kill all
+ // instructions except for load and store.
+
+ for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
+ Instruction *I = cast<Instruction>(*UI);
+ ++UI;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ continue;
+
+ if (!I->getType()->isVoidTy()) {
+ // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+ // Follow the use/def chain to erase them now instead of leaving it for
+ // dead code elimination later.
+ for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
+ Instruction *Inst = cast<Instruction>(*UUI);
+ ++UUI;
+ Inst->eraseFromParent();
+ }
+ }
+ I->eraseFromParent();
+ }
+}
+
+/// \brief Rewrite as many loads as possible given a single store.
+///
+/// When there is only a single store, we can use the domtree to trivially
+/// replace all of the dominated loads with the stored value. Do so, and return
+/// true if this has successfully promoted the alloca entirely. If this returns
+/// false there were some loads which were not dominated by the single store
+/// and thus must be phi-ed with undef. We fall back to the standard alloca
+/// promotion algorithm in that case.
+static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI, DominatorTree &DT,
+ AssumptionCache *AC) {
+ StoreInst *OnlyStore = Info.OnlyStore;
+ bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+ BasicBlock *StoreBB = OnlyStore->getParent();
+ int StoreIndex = -1;
+
+ // Clear out UsingBlocks. We will reconstruct it here if needed.
+ Info.UsingBlocks.clear();
+
+ for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) {
+ Instruction *UserInst = cast<Instruction>(*UI++);
+ if (!isa<LoadInst>(UserInst)) {
+ assert(UserInst == OnlyStore && "Should only have load/stores");
+ continue;
+ }
+ LoadInst *LI = cast<LoadInst>(UserInst);
+
+ // Okay, if we have a load from the alloca, we want to replace it with the
+ // only value stored to the alloca. We can do this if the value is
+ // dominated by the store. If not, we use the rest of the mem2reg machinery
+ // to insert the phi nodes as needed.
+ if (!StoringGlobalVal) { // Non-instructions are always dominated.
+ if (LI->getParent() == StoreBB) {
+ // If we have a use that is in the same block as the store, compare the
+ // indices of the two instructions to see which one came first. If the
+ // load came before the store, we can't handle it.
+ if (StoreIndex == -1)
+ StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+ if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+ // Can't handle this load, bail out.
+ Info.UsingBlocks.push_back(StoreBB);
+ continue;
+ }
+
+ } else if (LI->getParent() != StoreBB &&
+ !DT.dominates(StoreBB, LI->getParent())) {
+ // If the load and store are in different blocks, use BB dominance to
+ // check their relationships. If the store doesn't dom the use, bail
+ // out.
+ Info.UsingBlocks.push_back(LI->getParent());
+ continue;
+ }
+ }
+
+ // Otherwise, we *can* safely rewrite this load.
+ Value *ReplVal = OnlyStore->getOperand(0);
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Finally, after the scan, check to see if the store is all that is left.
+ if (!Info.UsingBlocks.empty())
+ return false; // If not, we'll have to fall back for the remainder.
+
+ // Record debuginfo for the store and remove the declaration's
+ // debuginfo.
+ if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+ DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
+ ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
+ DDI->eraseFromParent();
+ LBI.deleteValue(DDI);
+ }
+ // Remove the (now dead) store and alloca.
+ Info.OnlyStore->eraseFromParent();
+ LBI.deleteValue(Info.OnlyStore);
+
+ AI->eraseFromParent();
+ LBI.deleteValue(AI);
+ return true;
+}
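+
+// For illustration only (hypothetical IR): with a single store such as
+//
+//   store i32 %v, i32* %a
+//   ...
+//   %l = load i32, i32* %a     ; dominated by the store
+//
+// every dominated load like %l is rewritten to use %v directly, and the
+// store and the alloca are then erased by the function above.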
+
+/// Many allocas are only used within a single basic block. If this is the
+/// case, avoid traversing the CFG and inserting a lot of potentially useless
+/// PHI nodes by just performing a single linear pass over the basic block
+/// using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return false. This is necessary in cases where, due to control flow, the
+/// alloca is undefined only on some control flow paths. e.g. code like
+/// this is correct in LLVM IR:
+/// // A is an alloca with no stores so far
+/// for (...) {
+/// int t = *A;
+/// if (!first_iteration)
+/// use(t);
+/// *A = 42;
+/// }
+static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
+ LargeBlockInfo &LBI,
+ DominatorTree &DT,
+ AssumptionCache *AC) {
+ // The trickiest case to handle is when we have large blocks. Because of this,
+ // this code is optimized assuming that large blocks happen. This does not
+ // significantly pessimize the small block case. This uses LargeBlockInfo to
+ // make it efficient to get the index of various operations in the block.
+
+ // Walk the use-def list of the alloca, getting the locations of all stores.
+ typedef SmallVector<std::pair<unsigned, StoreInst *>, 64> StoresByIndexTy;
+ StoresByIndexTy StoresByIndex;
+
+ for (User *U : AI->users())
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+ // Sort the stores by their index, making it efficient to do a lookup with a
+ // binary search.
+ std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
+
+ // Walk all of the loads from this alloca, replacing them with the nearest
+ // store above them, if any.
+ for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+ if (!LI)
+ continue;
+
+ unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+ // Find the nearest store that has a lower index than this load.
+ StoresByIndexTy::iterator I =
+ std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+ std::make_pair(LoadIdx,
+ static_cast<StoreInst *>(nullptr)),
+ less_first());
+ if (I == StoresByIndex.begin()) {
+ if (StoresByIndex.empty())
+ // If there are no stores, the load takes the undef value.
+ LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+ else
+ // There is no store before this load, bail out (load may be affected
+ // by the following stores - see main comment).
+ return false;
+ } else {
+ // Otherwise, there was a store before this load; the load takes its value.
+ // Note that if the load was marked as nonnull we don't want to lose that
+ // information when we erase it, so we preserve it with an assume.
+ Value *ReplVal = std::prev(I)->second->getOperand(0);
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ }
+
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Remove the (now dead) stores and alloca.
+ while (!AI->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(AI->user_back());
+ // Record debuginfo for the store before removing it.
+ if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+ DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ }
+ SI->eraseFromParent();
+ LBI.deleteValue(SI);
+ }
+
+ AI->eraseFromParent();
+ LBI.deleteValue(AI);
+
+ // The alloca's debuginfo can be removed as well.
+ if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+ DDI->eraseFromParent();
+ LBI.deleteValue(DDI);
+ }
+
+ ++NumLocalPromoted;
+ return true;
+}
+
+void PromoteMem2Reg::run() {
+ Function &F = *DT.getRoot()->getParent();
+
+ AllocaDbgDeclares.resize(Allocas.size());
+
+ AllocaInfo Info;
+ LargeBlockInfo LBI;
+ ForwardIDFCalculator IDF(DT);
+
+ for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+ AllocaInst *AI = Allocas[AllocaNum];
+
+ assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+ assert(AI->getParent()->getParent() == &F &&
+ "All allocas should be in the same function, which is same as DF!");
+
+ removeLifetimeIntrinsicUsers(AI);
+
+ if (AI->use_empty()) {
+ // If there are no uses of the alloca, just delete it now.
+ AI->eraseFromParent();
+
+ // Remove the alloca from the Allocas list, since it has been processed
+ RemoveFromAllocasList(AllocaNum);
+ ++NumDeadAlloca;
+ continue;
+ }
+
+ // Calculate the set of read and write locations for each alloca. This is
+ // analogous to finding the 'uses' and 'definitions' of each variable.
+ Info.AnalyzeAlloca(AI);
+
+ // If there is only a single store to this value, replace any loads of
+ // it that are directly dominated by the definition with the value stored.
+ if (Info.DefiningBlocks.size() == 1) {
+ if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ ++NumSingleStore;
+ continue;
+ }
+ }
+
+ // If the alloca is only read and written in one basic block, just perform a
+ // linear sweep over the block to eliminate it.
+ if (Info.OnlyUsedInOneBlock &&
+ promoteSingleBlockAlloca(AI, Info, LBI, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ continue;
+ }
+
+ // If we haven't computed a numbering for the BB's in the function, do so
+ // now.
+ if (BBNumbers.empty()) {
+ unsigned ID = 0;
+ for (auto &BB : F)
+ BBNumbers[&BB] = ID++;
+ }
+
+ // Remember the dbg.declare intrinsic describing this alloca, if any.
+ if (Info.DbgDeclare)
+ AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
+
+ // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+ AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+ // Unique the set of defining blocks for efficient lookup.
+ SmallPtrSet<BasicBlock *, 32> DefBlocks;
+ DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end());
+
+ // Determine which blocks the value is live in. These are blocks which lead
+ // to uses.
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+ // At this point, we're committed to promoting the alloca using IDF's, and
+ // the standard SSA construction algorithm. Determine which blocks need phi
+ // nodes and see if we can optimize out some work by avoiding insertion of
+ // dead phi nodes.
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.setDefiningBlocks(DefBlocks);
+ SmallVector<BasicBlock *, 32> PHIBlocks;
+ IDF.calculate(PHIBlocks);
+ if (PHIBlocks.size() > 1)
+ std::sort(PHIBlocks.begin(), PHIBlocks.end(),
+ [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+ });
+
+ unsigned CurrentVersion = 0;
+ for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i)
+ QueuePhiNode(PHIBlocks[i], AllocaNum, CurrentVersion);
+ }
+
+ if (Allocas.empty())
+ return; // All of the allocas must have been trivial!
+
+ LBI.clear();
+
+ // Set the incoming values for the basic block to be undef values for all of
+ // the allocas. We do this in case there is a load of a value that has not
+ // been stored yet. In this case, it will get this undef value.
+ //
+ RenamePassData::ValVector Values(Allocas.size());
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+ Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+ // Walks all basic blocks in the function performing the SSA rename algorithm
+ // and inserting the phi nodes we marked as necessary
+ //
+ std::vector<RenamePassData> RenamePassWorkList;
+ RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values));
+ do {
+ RenamePassData RPD;
+ RPD.swap(RenamePassWorkList.back());
+ RenamePassWorkList.pop_back();
+ // RenamePass may add new worklist entries.
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+ } while (!RenamePassWorkList.empty());
+
+ // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+ Visited.clear();
+
+ // Remove the allocas themselves from the function.
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+ Instruction *A = Allocas[i];
+
+ // If there are any uses of the alloca instructions left, they must be in
+ // unreachable basic blocks that were not processed by walking the dominator
+ // tree. Just delete the users now.
+ if (!A->use_empty())
+ A->replaceAllUsesWith(UndefValue::get(A->getType()));
+ A->eraseFromParent();
+ }
+
+ // Remove the allocas' dbg.declare intrinsics from the function.
+ for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i)
+ if (DbgDeclareInst *DDI = AllocaDbgDeclares[i])
+ DDI->eraseFromParent();
+
+ // Loop over all of the PHI nodes and see if there are any that we can get
+ // rid of because they merge all of the same incoming values. This can
+ // happen due to undef values coming into the PHI nodes. This process is
+ // iterative, because eliminating one PHI node can cause others to be removed.
+ bool EliminatedAPHI = true;
+ while (EliminatedAPHI) {
+ EliminatedAPHI = false;
+
+ // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+ // simplify and RAUW them as we go. If it was not, we could add uses to
+ // the values we replace with in a non-deterministic order, thus creating
+ // non-deterministic def->use chains.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E;) {
+ PHINode *PN = I->second;
+
+ // If this PHI node merges one value and/or undefs, get the value.
+ if (Value *V = SimplifyInstruction(PN, SQ)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ NewPhiNodes.erase(I++);
+ EliminatedAPHI = true;
+ continue;
+ }
+ ++I;
+ }
+ }
+
+ // At this point, the renamer has added entries to PHI nodes for all reachable
+ // code. Unfortunately, there may be unreachable blocks which the renamer
+ // hasn't traversed. If this is the case, the PHI nodes may not
+ // have incoming values for all predecessors. Loop over all PHI nodes we have
+ // created, inserting undef values if they are missing any incoming values.
+ //
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E; ++I) {
+ // We want to do this once per basic block. As such, only process a block
+ // when we find the PHI that is the first entry in the block.
+ PHINode *SomePHI = I->second;
+ BasicBlock *BB = SomePHI->getParent();
+ if (&BB->front() != SomePHI)
+ continue;
+
+ // Only do work here if the PHI nodes are missing incoming values. We
+ // know that all PHI nodes that were inserted in a block will have the same
+ // number of incoming values, so we can just check any of them.
+ if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+ continue;
+
+ // Get the preds for BB.
+ SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+
+ // Ok, now we know that all of the PHI nodes are missing entries for some
+ // basic blocks. Start by sorting the incoming predecessors for efficient
+ // access.
+ std::sort(Preds.begin(), Preds.end());
+
+ // Now we loop through all BB's which have entries in SomePHI and remove
+ // them from the Preds list.
+ for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+ // Do a log(n) search of the Preds list for the entry we want.
+ SmallVectorImpl<BasicBlock *>::iterator EntIt = std::lower_bound(
+ Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i));
+ assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
+ "PHI node has entry for a block which is not a predecessor!");
+
+ // Remove the entry
+ Preds.erase(EntIt);
+ }
+
+ // At this point, the blocks left in the preds list must have dummy
+ // entries inserted into every PHI node for the block. Update all the phi
+ // nodes in this block that we are inserting (there could be phis before
+ // mem2reg runs).
+ unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+ BasicBlock::iterator BBI = BB->begin();
+ while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+ SomePHI->getNumIncomingValues() == NumBadPreds) {
+ Value *UndefVal = UndefValue::get(SomePHI->getType());
+ for (unsigned pred = 0, e = Preds.size(); pred != e; ++pred)
+ SomePHI->addIncoming(UndefVal, Preds[pred]);
+ }
+ }
+
+ NewPhiNodes.clear();
+}
+
+/// \brief Determine which blocks the value is live in.
+///
+/// These are blocks which lead to uses. Knowing this allows us to avoid
+/// inserting PHI nodes into blocks which don't lead to uses (thus, the
+/// inserted phi nodes would be dead).
+void PromoteMem2Reg::ComputeLiveInBlocks(
+ AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
+
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+ Info.UsingBlocks.end());
+
+ // If any of the using blocks is also a definition block, check to see if the
+ // definition occurs before or after the use. If it happens before the use,
+ // the value isn't really live-in.
+ for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+ BasicBlock *BB = LiveInBlockWorklist[i];
+ if (!DefBlocks.count(BB))
+ continue;
+
+ // Okay, this is a block that both uses and defines the value. If the first
+ // reference to the alloca is a def (store), then we know it isn't live-in.
+ for (BasicBlock::iterator I = BB->begin();; ++I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) != AI)
+ continue;
+
+ // We found a store to the alloca before a load. The alloca is not
+ // actually live-in here.
+ LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+ LiveInBlockWorklist.pop_back();
+ --i;
+ --e;
+ break;
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getOperand(0) != AI)
+ continue;
+
+ // Okay, we found a load before a store to the alloca. It is actually
+ // live into this block.
+ break;
+ }
+ }
+ }
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live-in here; insert it into the set. If it is already
+ // in the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too. Add the preds to the worklist unless they are a
+ // defining block.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca.
+///
+/// Returns true if there wasn't already a phi-node for that variable
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+ unsigned &Version) {
+ // Look up the basic-block in question.
+ PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
+
+ // If the BB already has a phi node added for the i'th alloca then we're done!
+ if (PN)
+ return false;
+
+ // Create a PhiNode using the dereferenced type... and add the phi-node to the
+ // BasicBlock.
+ PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
+ Allocas[AllocaNo]->getName() + "." + Twine(Version++),
+ &BB->front());
+ ++NumPHIInsert;
+ PhiToAllocaMap[PN] = AllocaNo;
+ return true;
+}
+
+/// \brief Recursively traverse the CFG of the function, renaming loads and
+/// stores to the allocas which we are promoting.
+///
+/// IncomingVals indicates what value each Alloca contains on exit from the
+/// predecessor block Pred.
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncomingVals,
+ std::vector<RenamePassData> &Worklist) {
+NextIteration:
+ // If we are inserting any phi nodes into this BB, they will already be in the
+ // block.
+ if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+ // If we have PHI nodes to update, compute the number of edges from Pred to
+ // BB.
+ if (PhiToAllocaMap.count(APN)) {
+ // We want to be able to distinguish PHI nodes being inserted by this
+ // invocation of mem2reg from those phi nodes that already existed in the
+ // IR before mem2reg was run. We determine that APN is being inserted
+ // because it is missing incoming edges. All other PHI nodes being
+ // inserted by this pass of mem2reg will have the same number of incoming
+ // operands so far. Remember this count.
+ unsigned NewPHINumOperands = APN->getNumOperands();
+
+ unsigned NumEdges = std::count(succ_begin(Pred), succ_end(Pred), BB);
+ assert(NumEdges && "Must be at least one edge from Pred to BB!");
+
+ // Add entries for all the phis.
+ BasicBlock::iterator PNI = BB->begin();
+ do {
+ unsigned AllocaNo = PhiToAllocaMap[APN];
+
+ // Add N incoming values to the PHI node.
+ for (unsigned i = 0; i != NumEdges; ++i)
+ APN->addIncoming(IncomingVals[AllocaNo], Pred);
+
+ // The currently active variable for this block is now the PHI.
+ IncomingVals[AllocaNo] = APN;
+ if (DbgDeclareInst *DDI = AllocaDbgDeclares[AllocaNo])
+ ConvertDebugDeclareToDebugValue(DDI, APN, DIB);
+
+ // Get the next phi node.
+ ++PNI;
+ APN = dyn_cast<PHINode>(PNI);
+ if (!APN)
+ break;
+
+ // Verify that it is missing entries. If not, it is not being inserted
+ // by this mem2reg invocation so we want to ignore it.
+ } while (APN->getNumOperands() == NewPHINumOperands);
+ }
+ }
+
+ // Don't revisit blocks.
+ if (!Visited.insert(BB).second)
+ return;
+
+ for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) {
+ Instruction *I = &*II++; // get the instruction, increment iterator
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+ if (!Src)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
+ if (AI == AllocaLookup.end())
+ continue;
+
+ Value *V = IncomingVals[AI->second];
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(V, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ // Anything using the load now uses the current value.
+ LI->replaceAllUsesWith(V);
+ BB->getInstList().erase(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Delete this instruction and mark the name as the current holder of the
+ // value
+ AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+ if (!Dest)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+ if (ai == AllocaLookup.end())
+ continue;
+
+ // what value were we writing?
+ IncomingVals[ai->second] = SI->getOperand(0);
+ // Record debuginfo for the store before removing it.
+ if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second])
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ BB->getInstList().erase(SI);
+ }
+ }
+
+ // 'Recurse' to our successors.
+ succ_iterator I = succ_begin(BB), E = succ_end(BB);
+ if (I == E)
+ return;
+
+ // Keep track of the successors so we don't visit the same successor twice
+ SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
+
+ // Handle the first successor without using the worklist.
+ VisitedSuccs.insert(*I);
+ Pred = BB;
+ BB = *I;
+ ++I;
+
+ for (; I != E; ++I)
+ if (VisitedSuccs.insert(*I).second)
+ Worklist.emplace_back(*I, Pred, IncomingVals);
+
+ goto NextIteration;
+}
+
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC) {
+ // If there is nothing to do, bail out...
+ if (Allocas.empty())
+ return;
+
+ PromoteMem2Reg(Allocas, DT, AC).run();
+}
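+
+// A minimal usage sketch (illustrative only; the surrounding pass boilerplate
+// is assumed, not part of this file):
+//
+//   std::vector<AllocaInst *> Allocas = collectPromotableAllocas(F);
+//   DominatorTree DT(F);
+//   AssumptionCache AC(F);
+//   PromoteMemToReg(Allocas, DT, &AC);
+//
+// where collectPromotableAllocas is a hypothetical helper that gathers the
+// allocas in the entry block for which isAllocaPromotable returns true.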
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
new file mode 100644
index 000000000000..6ccf54e49dd3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -0,0 +1,495 @@
+//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdater class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdater"
+
+typedef DenseMap<BasicBlock*, Value*> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+ return *static_cast<AvailableValsTy*>(AV);
+}
+
+SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
+ : InsertedPHIs(NewPHI) {}
+
+SSAUpdater::~SSAUpdater() {
+ delete static_cast<AvailableValsTy*>(AV);
+}
+
+void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
+ if (!AV)
+ AV = new AvailableValsTy();
+ else
+ getAvailableVals(AV).clear();
+ ProtoType = Ty;
+ ProtoName = Name;
+}
+
+bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
+ return getAvailableVals(AV).count(BB);
+}
+
+void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
+ assert(ProtoType && "Need to initialize SSAUpdater");
+ assert(ProtoType == V->getType() &&
+ "All rewritten values must have the same type");
+ getAvailableVals(AV)[BB] = V;
+}
+
+static bool IsEquivalentPHI(PHINode *PHI,
+ SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) {
+ unsigned PHINumValues = PHI->getNumIncomingValues();
+ if (PHINumValues != ValueMapping.size())
+ return false;
+
+ // Scan the phi to see if it matches.
+ for (unsigned i = 0, e = PHINumValues; i != e; ++i)
+ if (ValueMapping[PHI->getIncomingBlock(i)] !=
+ PHI->getIncomingValue(i)) {
+ return false;
+ }
+
+ return true;
+}
+
+Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
+ Value *Res = GetValueAtEndOfBlockInternal(BB);
+ return Res;
+}
+
+Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
+ // If there is no definition of the renamed variable in this block, just use
+ // GetValueAtEndOfBlock to do our work.
+ if (!HasValueForBlock(BB))
+ return GetValueAtEndOfBlock(BB);
+
+ // Otherwise, we have the hard case. Get the live-in values for each
+ // predecessor.
+ SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues;
+ Value *SingularValue = nullptr;
+
+ // We can get our predecessor info by walking the pred_iterator list, but it
+ // is relatively slow. If we already have PHI nodes in this block, walk one
+ // of them to get the predecessor list instead.
+ if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (i == 0)
+ SingularValue = PredVal;
+ else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ } else {
+ bool isFirstPred = true;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBB = *PI;
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (isFirstPred) {
+ SingularValue = PredVal;
+ isFirstPred = false;
+ } else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ }
+
+ // If there are no predecessors, just return undef.
+ if (PredValues.empty())
+ return UndefValue::get(ProtoType);
+
+ // Otherwise, if all the merged values are the same, just use it.
+ if (SingularValue)
+ return SingularValue;
+
+ // Otherwise, we do need a PHI: check to see if we already have one available
+ // in this block that produces the right value.
+ if (isa<PHINode>(BB->begin())) {
+ SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(),
+ PredValues.end());
+ PHINode *SomePHI;
+ for (BasicBlock::iterator It = BB->begin();
+ (SomePHI = dyn_cast<PHINode>(It)); ++It) {
+ if (IsEquivalentPHI(SomePHI, ValueMapping))
+ return SomePHI;
+ }
+ }
+
+ // Ok, we have no way out; insert a new one now.
+ PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
+ ProtoName, &BB->front());
+
+ // Fill in all the predecessors of the PHI.
+ for (const auto &PredValue : PredValues)
+ InsertedPHI->addIncoming(PredValue.second, PredValue.first);
+
+ // See if the PHI node can be merged to a single value. This can happen in
+ // loop cases when we get a PHI of itself and one other value.
+ if (Value *V =
+ SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
+ InsertedPHI->eraseFromParent();
+ return V;
+ }
+
+ // Set the DebugLoc of the inserted PHI, if available.
+ DebugLoc DL;
+ if (const Instruction *I = BB->getFirstNonPHI())
+ DL = I->getDebugLoc();
+ InsertedPHI->setDebugLoc(DL);
+
+ // If the client wants to know about all new instructions, tell it.
+ if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+ DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ return InsertedPHI;
+}
+
+void SSAUpdater::RewriteUse(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueInMiddleOfBlock(User->getParent());
+
+ // Notify the users of the existing value that it is being replaced.
+ Value *OldVal = U.get();
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+
+ U.set(V);
+}
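+
+// A minimal usage sketch (illustrative only; BB1/BB2, V1/V2 and U are
+// hypothetical):
+//
+//   SSAUpdater SSA;
+//   SSA.Initialize(V1->getType(), "myval");
+//   SSA.AddAvailableValue(BB1, V1);   // def reaching the end of BB1
+//   SSA.AddAvailableValue(BB2, V2);   // def reaching the end of BB2
+//   SSA.RewriteUse(U);                // U now sees the correct reaching def,
+//                                     // with PHIs inserted where needed.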
+
+void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueAtEndOfBlock(User->getParent());
+
+ U.set(V);
+}
+
+namespace llvm {
+
+template<>
+class SSAUpdaterTraits<SSAUpdater> {
+public:
+ typedef BasicBlock BlkT;
+ typedef Value *ValT;
+ typedef PHINode PhiT;
+
+ typedef succ_iterator BlkSucc_iterator;
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
+
+ class PHI_iterator {
+ private:
+ PHINode *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(PHINode *P) // begin iterator
+ : PHI(P), idx(0) {}
+ PHI_iterator(PHINode *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+ PHI_iterator &operator++() { ++idx; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
+ Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+ BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+ };
+
+ static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+ static PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
+ /// vector, set Info->NumPreds, and allocate space in Info->Preds.
+ static void FindPredecessorBlocks(BasicBlock *BB,
+ SmallVectorImpl<BasicBlock*> *Preds) {
+ // We can get our predecessor info by walking the pred_iterator list,
+ // but it is relatively slow. If we already have PHI nodes in this
+ // block, walk one of them to get the predecessor list instead.
+ if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+ Preds->append(SomePhi->block_begin(), SomePhi->block_end());
+ } else {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ Preds->push_back(*PI);
+ }
+ }
+
+ /// GetUndefVal - Get an undefined value of the same type as the value
+ /// being handled.
+ static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
+ return UndefValue::get(Updater->ProtoType);
+ }
+
+ /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
+ /// Reserve space for the operands but do not fill them in yet.
+ static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
+ SSAUpdater *Updater) {
+ PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
+ Updater->ProtoName, &BB->front());
+ return PHI;
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
+ PHI->addIncoming(Val, Pred);
+ }
+
+ /// InstrIsPHI - Check if an instruction is a PHI.
+ ///
+ static PHINode *InstrIsPHI(Instruction *I) {
+ return dyn_cast<PHINode>(I);
+ }
+
+ /// ValueIsPHI - Check if a value is a PHI.
+ ///
+ static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
+ return dyn_cast<PHINode>(Val);
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
+ PHINode *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->getNumIncomingValues() == 0)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the value
+ /// that it defines.
+ static Value *GetPHIValue(PHINode *PHI) {
+ return PHI;
+ }
+};
+
+} // end namespace llvm
+
+/// Check to see if AvailableVals has an entry for the specified BB and if so,
+/// return it. If not, construct SSA form by first calculating the required
+/// placement of PHIs and then inserting new PHIs where needed.
+Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
+ AvailableValsTy &AvailableVals = getAvailableVals(AV);
+ if (Value *V = AvailableVals[BB])
+ return V;
+
+ SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+ return Impl.GetValue(BB);
+}
+
+//===----------------------------------------------------------------------===//
+// LoadAndStorePromoter Implementation
+//===----------------------------------------------------------------------===//
+
+LoadAndStorePromoter::
+LoadAndStorePromoter(ArrayRef<const Instruction*> Insts,
+ SSAUpdater &S, StringRef BaseName) : SSA(S) {
+ if (Insts.empty()) return;
+
+ const Value *SomeVal;
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
+ SomeVal = LI;
+ else
+ SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
+
+ if (BaseName.empty())
+ BaseName = SomeVal->getName();
+ SSA.Initialize(SomeVal->getType(), BaseName);
+}
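+
+// Typical use (a sketch, not a requirement of the interface shown here):
+// clients subclass LoadAndStorePromoter, optionally overriding hooks such as
+// isInstInList, replaceLoadWithValue, updateDebugInfo and instructionDeleted
+// (all used below), then call run() with the loads and stores to promote.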
+
+void LoadAndStorePromoter::
+run(const SmallVectorImpl<Instruction*> &Insts) const {
+ // First step: bucket up uses of the alloca by the block they occur in.
+ // This is important because we have to handle multiple defs/uses in a block
+ // ourselves: SSAUpdater is purely for cross-block references.
+ DenseMap<BasicBlock*, TinyPtrVector<Instruction*>> UsesByBlock;
+
+ for (Instruction *User : Insts)
+ UsesByBlock[User->getParent()].push_back(User);
+
+ // Okay, now we can iterate over all the blocks in the function with uses,
+ // processing them. Keep track of which loads are loading a live-in value.
+ // Walk the uses in use-list order to be deterministic.
+ SmallVector<LoadInst*, 32> LiveInLoads;
+ DenseMap<Value*, Value*> ReplacedLoads;
+
+ for (Instruction *User : Insts) {
+ BasicBlock *BB = User->getParent();
+ TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB];
+
+ // If this block has already been processed, ignore this repeat use.
+ if (BlockUses.empty()) continue;
+
+ // Okay, this is the first use in the block. If this block just has a
+ // single user in it, we can rewrite it trivially.
+ if (BlockUses.size() == 1) {
+ // If it is a store, it is a trivial def of the value in the block.
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ updateDebugInfo(SI);
+ SSA.AddAvailableValue(BB, SI->getOperand(0));
+ } else
+ // Otherwise it is a load, queue it to rewrite as a live-in load.
+ LiveInLoads.push_back(cast<LoadInst>(User));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, check to see if this block is all loads.
+ bool HasStore = false;
+ for (Instruction *I : BlockUses) {
+ if (isa<StoreInst>(I)) {
+ HasStore = true;
+ break;
+ }
+ }
+
+ // If so, we can queue them all as live in loads. We don't have an
+ // efficient way to tell which one is first in the block and don't want to
+ // scan large blocks, so just add all loads as live ins.
+ if (!HasStore) {
+ for (Instruction *I : BlockUses)
+ LiveInLoads.push_back(cast<LoadInst>(I));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+ // Since SSAUpdater is purely for cross-block values, we need to determine
+ // the order of these instructions in the block. If the first use in the
+ // block is a load, then it uses the live in value. The last store defines
+ // the live out value. We handle this by doing a linear scan of the block.
+ Value *StoredValue = nullptr;
+ for (Instruction &I : *BB) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
+ // If this is a load from an unrelated pointer, ignore it.
+ if (!isInstInList(L, Insts)) continue;
+
+ // If we haven't seen a store yet, this is a live-in use; otherwise,
+ // use the stored value.
+ if (StoredValue) {
+ replaceLoadWithValue(L, StoredValue);
+ L->replaceAllUsesWith(StoredValue);
+ ReplacedLoads[L] = StoredValue;
+ } else {
+ LiveInLoads.push_back(L);
+ }
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // If this is a store to an unrelated pointer, ignore it.
+ if (!isInstInList(SI, Insts)) continue;
+ updateDebugInfo(SI);
+
+ // Remember that this is the active value in the block.
+ StoredValue = SI->getOperand(0);
+ }
+ }
+
+ // The last value stored is the live-out for the block.
+ assert(StoredValue && "Already checked that there is a store in block");
+ SSA.AddAvailableValue(BB, StoredValue);
+ BlockUses.clear();
+ }
+
+ // Okay, now we rewrite all loads that use live-in values in the loop,
+ // inserting PHI nodes as necessary.
+ for (LoadInst *ALoad : LiveInLoads) {
+ Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
+ replaceLoadWithValue(ALoad, NewVal);
+
+ // Avoid assertions in unreachable code.
+ if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
+ ALoad->replaceAllUsesWith(NewVal);
+ ReplacedLoads[ALoad] = NewVal;
+ }
+
+ // Allow the client to do stuff before we start nuking things.
+ doExtraRewritesBeforeFinalDeletion();
+
+ // Now that everything is rewritten, delete the old instructions from the
+ // function. They should all be dead now.
+ for (Instruction *User : Insts) {
+ // If this is a load that still has uses, then the load must have been added
+ // as a live value in the SSAUpdater data structure for a block (e.g. because
+ // the loaded value was stored later). In this case, we need to recursively
+ // propagate the updates until we get to the real value.
+ if (!User->use_empty()) {
+ Value *NewVal = ReplacedLoads[User];
+ assert(NewVal && "not a replaced load?");
+
+ // Propagate down to the ultimate replacee. The intermediate loads
+ // could theoretically already have been deleted, so we don't want to
+ // dereference the Value*'s.
+ DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+ while (RLI != ReplacedLoads.end()) {
+ NewVal = RLI->second;
+ RLI = ReplacedLoads.find(NewVal);
+ }
+
+ replaceLoadWithValue(cast<LoadInst>(User), NewVal);
+ User->replaceAllUsesWith(NewVal);
+ }
+
+ instructionDeleted(User);
+ User->eraseFromParent();
+ }
+}
+
+bool
+LoadAndStorePromoter::isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction*> &Insts)
+ const {
+ return is_contained(Insts, I);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp b/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp
new file mode 100644
index 000000000000..8c23957ac43e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp
@@ -0,0 +1,108 @@
+//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements code generation for sanitizer statistics gathering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SanitizerStats.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
+ StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
+ EmptyModuleStatsTy = makeModuleStatsTy();
+
+ ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
+ GlobalValue::InternalLinkage, nullptr);
+}
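+
+// Typical usage is a sketch along these lines (illustrative only; the insert
+// point and stat kind below are placeholders, not taken from this file):
+//
+//   SanitizerStatReport StatReport(&M);
+//   IRBuilder<> B(InsertionPt);              // some instrumentation point
+//   StatReport.create(B, SanStat_CFI_VCall); // emit one stat at that point
+//   ...
+//   StatReport.finish();                     // finalize per-module data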
+
+ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
+ return ArrayType::get(StatTy, Inits.size());
+}
+
+StructType *SanitizerStatReport::makeModuleStatsTy() {
+ return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
+ Type::getInt32Ty(M->getContext()),
+ makeModuleStatsArrayTy()});
+}
+
+void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
+ Function *F = B.GetInsertBlock()->getParent();
+ Module *M = F->getParent();
+ PointerType *Int8PtrTy = B.getInt8PtrTy();
+ IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
+ ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
+
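+ // Each entry packs the stat kind into the topmost kSanitizerStatKindBits
+ // bits of a pointer-sized integer (e.g. bits 61..63 on a 64-bit target if
+ // the kind field is 3 bits wide); the low bits are left as zero here.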
+ Inits.push_back(ConstantArray::get(
+ StatTy,
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
+ kSanitizerStatKindBits)),
+ Int8PtrTy)}));
+
+ FunctionType *StatReportTy =
+ FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
+ Constant *StatReport = M->getOrInsertFunction(
+ "__sanitizer_stat_report", StatReportTy);
+
+ auto InitAddr = ConstantExpr::getGetElementPtr(
+ EmptyModuleStatsTy, ModuleStatsGV,
+ ArrayRef<Constant *>{
+ ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
+ ConstantInt::get(IntPtrTy, Inits.size() - 1),
+ });
+ B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
+}
+
+void SanitizerStatReport::finish() {
+ if (Inits.empty()) {
+ ModuleStatsGV->eraseFromParent();
+ return;
+ }
+
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
+ Type *VoidTy = Type::getVoidTy(M->getContext());
+
+ // Create a new ModuleStatsGV to replace the old one. We can't just set the
+ // old one's initializer because its type is different.
+ auto NewModuleStatsGV = new GlobalVariable(
+ *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
+ ConstantStruct::getAnon(
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantInt::get(Int32Ty, Inits.size()),
+ ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
+ ModuleStatsGV->replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
+ ModuleStatsGV->eraseFromParent();
+
+ // Create a global constructor to register NewModuleStatsGV.
+ auto F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage, "", M);
+ auto BB = BasicBlock::Create(M->getContext(), "", F);
+ IRBuilder<> B(BB);
+
+ FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
+ Constant *StatInit = M->getOrInsertFunction(
+ "__sanitizer_stat_init", StatInitTy);
+
+ B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
+ B.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
new file mode 100644
index 000000000000..dee658f98393
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -0,0 +1,5996 @@
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "simplifycfg"
+
+// Chosen as 2 so as to be cheap, but still to have enough power to fold
+// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
+// To catch this, we need to fold a compare and a select, hence '2' being the
+// minimum reasonable default.
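+// A minimal illustration (hypothetical source, not from this file):
+//   int clamp(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }
+// each bound is a compare feeding a select, and folding that pair is exactly
+// what a threshold of 2 allows.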
+static cl::opt<unsigned> PHINodeFoldingThreshold(
+ "phi-node-folding-threshold", cl::Hidden, cl::init(2),
+ cl::desc(
+ "Control the amount of phi node folding to perform (default = 2)"));
+
+static cl::opt<bool> DupRet(
+ "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
+ cl::desc("Duplicate return instructions into unconditional branches"));
+
+static cl::opt<bool>
+ SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
+
+static cl::opt<bool> HoistCondStores(
+ "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores if an unconditional store precedes"));
+
+static cl::opt<bool> MergeCondStores(
+ "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores even if an unconditional store does not "
+ "precede - hoist multiple conditional stores into a single "
+ "predicated store"));
+
+static cl::opt<bool> MergeCondStoresAggressively(
+ "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false),
+ cl::desc("When merging conditional stores, do so even if the resultant "
+ "basic blocks are unlikely to be if-converted as a result"));
+
+static cl::opt<bool> SpeculateOneExpensiveInst(
+ "speculate-one-expensive-inst", cl::Hidden, cl::init(true),
+ cl::desc("Allow exactly one expensive instruction to be speculatively "
+ "executed"));
+
+static cl::opt<unsigned> MaxSpeculationDepth(
+ "max-speculation-depth", cl::Hidden, cl::init(10),
+ cl::desc("Limit maximum recursion depth when calculating costs of "
+ "speculatively executed instructions"));
+
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLinearMaps,
+ "Number of switch instructions turned into linear mapping");
+STATISTIC(NumLookupTables,
+ "Number of switch instructions turned into lookup tables");
+STATISTIC(
+ NumLookupTablesHoles,
+ "Number of switch instructions turned into lookup tables (holes checked)");
+STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
+STATISTIC(NumSinkCommons,
+ "Number of common instructions sunk down to the end block");
+STATISTIC(NumSpeculations, "Number of speculative executed instructions");
+
+namespace {
+
+// The first field contains the value that the switch produces when a certain
+// case group is selected, and the second field is a vector containing the
+// cases composing the case group.
+typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>
+ SwitchCaseResultVectorTy;
+// The first field contains the phi node that generates a result of the switch
+// and the second field contains the value generated for a certain case in the
+// switch for that PHI.
+typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy;
+
+/// ValueEqualityComparisonCase - Represents a case of a switch.
+struct ValueEqualityComparisonCase {
+ ConstantInt *Value;
+ BasicBlock *Dest;
+
+ ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+ : Value(Value), Dest(Dest) {}
+
+ bool operator<(ValueEqualityComparisonCase RHS) const {
+ // Comparing pointers is ok as we only rely on the order for uniquing.
+ return Value < RHS.Value;
+ }
+
+ bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
+};
+
+class SimplifyCFGOpt {
+ const TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ unsigned BonusInstThreshold;
+ AssumptionCache *AC;
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders;
+ // See comments in SimplifyCFGOpt::SimplifySwitch.
+ bool LateSimplifyCFG;
+ Value *isValueEqualityComparison(TerminatorInst *TI);
+ BasicBlock *GetValueEqualityComparisonCases(
+ TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
+ bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+ BasicBlock *Pred,
+ IRBuilder<> &Builder);
+ bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+ IRBuilder<> &Builder);
+
+ bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+ bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
+ bool SimplifySingleResume(ResumeInst *RI);
+ bool SimplifyCommonResume(ResumeInst *RI);
+ bool SimplifyCleanupReturn(CleanupReturnInst *RI);
+ bool SimplifyUnreachable(UnreachableInst *UI);
+ bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+ bool SimplifyIndirectBr(IndirectBrInst *IBI);
+ bool SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+
+public:
+ SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
+ unsigned BonusInstThreshold, AssumptionCache *AC,
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG)
+ : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
+ LoopHeaders(LoopHeaders), LateSimplifyCFG(LateSimplifyCFG) {}
+
+ bool run(BasicBlock *BB);
+};
+
+} // end anonymous namespace
+
+/// Return true if it is safe to merge these two
+/// terminator instructions together.
+static bool
+SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2,
+ SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
+
+ // It is not safe to merge these two switch instructions if they have a common
+ // successor, and if that successor has a PHI node, and if *that* PHI node has
+ // conflicting incoming values from the two switch blocks.
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ bool Fail = false;
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) !=
+ PN->getIncomingValueForBlock(SI2BB)) {
+ if (FailBlocks)
+ FailBlocks->insert(Succ);
+ Fail = true;
+ }
+ }
+
+ return !Fail;
+}
+
+/// Return true if it is safe and profitable to merge these two terminator
+/// instructions together, where SI1 is an unconditional branch. PhiNodes will
+/// store all PHI nodes in common successors.
+static bool
+isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2,
+ Instruction *Cond,
+ SmallVectorImpl<PHINode *> &PhiNodes) {
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
+ assert(SI1->isUnconditional() && SI2->isConditional());
+
+ // We fold the unconditional branch if we can easily update all PHI nodes in
+ // common successors:
+ // 1> We have a constant incoming value for the conditional branch;
+ // 2> We have "Cond" as the incoming value for the unconditional branch;
+ // 3> SI2->getCondition() and Cond have the same operands.
+ CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition());
+ if (!Ci2)
+ return false;
+ if (!(Cond->getOperand(0) == Ci2->getOperand(0) &&
+ Cond->getOperand(1) == Ci2->getOperand(1)) &&
+ !(Cond->getOperand(0) == Ci2->getOperand(1) &&
+ Cond->getOperand(1) == Ci2->getOperand(0)))
+ return false;
+
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) != Cond ||
+ !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB)))
+ return false;
+ PhiNodes.push_back(PN);
+ }
+ return true;
+}
+
+/// Update PHI nodes in Succ to indicate that there will now be entries in it
+/// from the 'NewPred' block. The values that will be flowing into the PHI nodes
+/// will be the same as those coming in from ExistPred, an existing predecessor
+/// of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+ BasicBlock *ExistPred) {
+ if (!isa<PHINode>(Succ->begin()))
+ return; // Quick exit if nothing to do
+
+ PHINode *PN;
+ for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I)
+ PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+/// Compute an abstract "cost" of speculating the given instruction,
+/// which is assumed to be safe to speculate. TCC_Free means cheap,
+/// TCC_Basic means less cheap, and TCC_Expensive means prohibitively
+/// expensive.
+static unsigned ComputeSpeculationCost(const User *I,
+ const TargetTransformInfo &TTI) {
+ assert(isSafeToSpeculativelyExecute(I) &&
+ "Instruction is not safe to speculatively execute!");
+ return TTI.getUserCost(I);
+}
+
+/// If we have a merge point of an "if condition" as accepted above,
+/// return true if the specified value dominates the block. We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
+/// see if V (which must be an instruction) and its recursive operands
+/// that do not dominate BB have a combined cost lower than CostRemaining and
+/// are non-trapping. If both are true, the instruction is inserted into the
+/// set and true is returned.
+///
+/// The cost for most non-trapping instructions is defined as 1 except for
+/// Select whose cost is 2.
+///
+/// After this function returns, CostRemaining is decreased by the cost of
+/// V plus its non-dominating operands. If that cost is greater than
+/// CostRemaining, false is returned and CostRemaining is undefined.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+ SmallPtrSetImpl<Instruction *> *AggressiveInsts,
+ unsigned &CostRemaining,
+ const TargetTransformInfo &TTI,
+ unsigned Depth = 0) {
+ // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
+ // so limit the recursion depth.
+ // TODO: While this recursion limit does prevent pathological behavior, it
+ // would be better to track visited instructions to avoid cycles.
+ if (Depth == MaxSpeculationDepth)
+ return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ // Non-instructions all dominate instructions, but not all constantexprs
+ // can be executed unconditionally.
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+ if (C->canTrap())
+ return false;
+ return true;
+ }
+ BasicBlock *PBB = I->getParent();
+
+ // We don't want to allow weird loops that might have the "if condition" in
+ // the bottom of this block.
+ if (PBB == BB)
+ return false;
+
+ // If this instruction is defined in a block that contains an unconditional
+ // branch to BB, then it must be in the 'conditional' part of the "if
+ // statement". If not, it definitely dominates the region.
+ BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
+ if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
+ return true;
+
+ // If we aren't allowing aggressive promotion anymore, then don't consider
+ // instructions in the 'if region'.
+ if (!AggressiveInsts)
+ return false;
+
+ // If we have seen this instruction before, don't count it again.
+ if (AggressiveInsts->count(I))
+ return true;
+
+ // Okay, it looks like the instruction IS in the "condition". Check to
+ // see if it's a cheap instruction to unconditionally compute, and if it
+ // only uses stuff defined outside of the condition. If so, hoist it out.
+ if (!isSafeToSpeculativelyExecute(I))
+ return false;
+
+ unsigned Cost = ComputeSpeculationCost(I, TTI);
+
+ // Allow exactly one instruction to be speculated regardless of its cost
+ // (as long as it is safe to do so).
+ // This is intended to flatten the CFG even if the instruction is a division
+ // or other expensive operation. The speculation of an expensive instruction
+ // is expected to be undone in CodeGenPrepare if the speculation has not
+ // enabled further IR optimizations.
+ if (Cost > CostRemaining &&
+ (!SpeculateOneExpensiveInst || !AggressiveInsts->empty() || Depth > 0))
+ return false;
+
+ // Avoid unsigned wrap.
+ CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost;
+
+ // Okay, we can only really hoist these out if their operands do
+ // not take us over the cost threshold.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI,
+ Depth + 1))
+ return false;
+ // Okay, it's safe to do this! Remember this instruction.
+ AggressiveInsts->insert(I);
+ return true;
+}
+
+/// Extract ConstantInt from value, looking through IntToPtr
+/// and PointerNullValue. Return NULL if value is not a constant int.
+static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
+ // Normal constant int.
+ ConstantInt *CI = dyn_cast<ConstantInt>(V);
+ if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
+ return CI;
+
+ // This is some kind of pointer constant. Turn it into a pointer-sized
+ // ConstantInt if possible.
+ IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
+
+ // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
+ if (isa<ConstantPointerNull>(V))
+ return ConstantInt::get(PtrTy, 0);
+
+ // IntToPtr const int.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
+ // The constant is very likely to have the right type already.
+ if (CI->getType() == PtrTy)
+ return CI;
+ else
+ return cast<ConstantInt>(
+ ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+ }
+ return nullptr;
+}
+
+namespace {
+
+/// Given a chain of or (||) or and (&&) comparisons of a value against a
+/// constant, this will try to recover the information required for a switch
+/// structure.
+/// It will depth-first traverse the chain of comparisons, looking for patterns
+/// like %a == 12 or %a < 4 and combine them to produce a set of integers
+/// representing the different cases for the switch.
+/// Note that if the chain is composed of '||' it will build the set of elements
+/// that match the comparisons (i.e. any of these values validates the chain),
+/// while for a chain of '&&' it will build the set of elements that make the
+/// test fail.
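+///
+/// For illustration (hypothetical source): gathering the chain
+///   if (x == 10 || x == 20 || x <u 4)   // <u is an unsigned compare
+/// yields CompValue = x and Vals = {10, 20, 0, 1, 2, 3}, which the caller can
+/// then turn into a switch over x.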
+struct ConstantComparesGatherer {
+ const DataLayout &DL;
+ Value *CompValue; /// Value found for the switch comparison
+ Value *Extra; /// Extra clause to be checked before the switch
+ SmallVector<ConstantInt *, 8> Vals; /// Set of integers to match in switch
+ unsigned UsedICmps; /// Number of comparisons matched in the and/or chain
+
+ /// Construct and compute the result for the comparison instruction Cond
+ ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL)
+ : DL(DL), CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
+ gather(Cond);
+ }
+
+ /// Prevent copy
+ ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
+ ConstantComparesGatherer &
+ operator=(const ConstantComparesGatherer &) = delete;
+
+private:
+ /// Try to set the current value used for the comparison; it succeeds only if
+ /// it wasn't set before or if the new value is the same as the old one.
+ bool setValueOnce(Value *NewVal) {
+ if (CompValue && CompValue != NewVal)
+ return false;
+ CompValue = NewVal;
+ return (CompValue != nullptr);
+ }
+
+ /// Try to match Instruction "I" as a comparison against a constant and
+ /// populate the array Vals with the set of values that match (or do not
+ /// match depending on isEQ).
+ /// Return false on failure. On success, the Value the comparison matched
+ /// against is placed in CompValue.
+ /// If CompValue is already set, the function is expected to fail if a match
+ /// is found but the value compared to is different.
+ bool matchInstruction(Instruction *I, bool isEQ) {
+ // If this is an icmp against a constant, handle this as one of the cases.
+ ICmpInst *ICI;
+ ConstantInt *C;
+ if (!((ICI = dyn_cast<ICmpInst>(I)) &&
+ (C = GetConstantInt(I->getOperand(1), DL)))) {
+ return false;
+ }
+
+ Value *RHSVal;
+ const APInt *RHSC;
+
+ // Pattern match a special case
+ // (x & ~2^z) == y --> x == y || x == y|2^z
+ // This undoes a transformation done by instcombine to fuse 2 compares.
+ if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+
+ // It's a little bit hard to see why the following transformations are
+ // correct. Here is a CVC3 program to verify them for 64-bit values:
+
+ /*
+ ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
+ x : BITVECTOR(64);
+ y : BITVECTOR(64);
+ z : BITVECTOR(64);
+ mask : BITVECTOR(64) = BVSHL(ONE, z);
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+
+ // Please note that each pattern must be a dual implication (<--> or
+ // iff). One directional implication can create spurious matches. If the
+ // implication is only one-way, an unsatisfiable condition on the left
+ // side can imply a satisfiable condition on the right side. Dual
+ // implication ensures that satisfiable conditions are transformed to
+ // other satisfiable conditions and unsatisfiable conditions are
+ // transformed to other unsatisfiable conditions.
+
+ // Here is a concrete example of an unsatisfiable condition on the left
+ // implying a satisfiable condition on the right:
+ //
+ // mask = (1 << z)
+ // (x & ~mask) == y --> (x == y || x == (y | mask))
+ //
+ // Substituting y = 3, z = 0 yields:
+ // (x & -2) == 3 --> (x == 3 || x == 2)
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = ~*RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(
+ ConstantInt::get(C->getContext(),
+ C->getValue() | Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = *RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(ConstantInt::get(C->getContext(),
+ C->getValue() & ~Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(ICI->getOperand(0)))
+ return false;
+
+ UsedICmps++;
+ Vals.push_back(C);
+ return true;
+ }
+
+ // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
+ ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
+ ICI->getPredicate(), C->getValue());
+
+ // Shift the range if the compare is fed by an add. This is the range
+ // compare idiom as emitted by instcombine.
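+ // For example (illustrative): "(x + 100) u< 3" constrains x + 100 to the
+ // range [0, 3); subtracting the added constant recovers the corresponding
+ // range of x itself before the values are enumerated below.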
+ Value *CandidateVal = I->getOperand(0);
+ if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
+ Span = Span.subtract(*RHSC);
+ CandidateVal = RHSVal;
+ }
+
+ // If this is an and/!= check, then we are looking to build the set of
+ // values that *don't* pass the and chain. I.e. to turn "x ugt 2" into
+ // x != 0 && x != 1.
+ if (!isEQ)
+ Span = Span.inverse();
+
+ // If there are a ton of values, we don't want to make a ginormous switch.
+ if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
+ return false;
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(CandidateVal))
+ return false;
+
+ // Add all values from the range to the set
+ for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+
+ UsedICmps++;
+ return true;
+ }
+
+ /// Given a potentially 'or'd or 'and'd together collection of icmp
+ /// eq/ne/lt/gt instructions that compare a value against a constant, extract
+ /// the value being compared, and stick the list of constants into the Vals
+ /// vector.
+ /// One "Extra" case is allowed to differ from the other.
+ void gather(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ bool isEQ = (I->getOpcode() == Instruction::Or);
+
+ // Keep a stack (SmallVector for efficiency) for depth-first traversal
+ SmallVector<Value *, 8> DFT;
+ SmallPtrSet<Value *, 8> Visited;
+
+ // Initialize
+ Visited.insert(V);
+ DFT.push_back(V);
+
+ while (!DFT.empty()) {
+ V = DFT.pop_back_val();
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If it is a || (or && depending on isEQ), process the operands.
+ if (I->getOpcode() == (isEQ ? Instruction::Or : Instruction::And)) {
+ if (Visited.insert(I->getOperand(1)).second)
+ DFT.push_back(I->getOperand(1));
+ if (Visited.insert(I->getOperand(0)).second)
+ DFT.push_back(I->getOperand(0));
+ continue;
+ }
+
+ // Try to match the current instruction
+ if (matchInstruction(I, isEQ))
+ // Match succeeded, continue the loop
+ continue;
+ }
+
+ // One element of the sequence of || (or &&) could not be matched as a
+ // comparison against the same value as the others.
+ // We allow only one "Extra" case to be checked before the switch
+ if (!Extra) {
+ Extra = V;
+ continue;
+ }
+ // Failed to parse a proper sequence, abort now
+ CompValue = nullptr;
+ break;
+ }
+ }
+};
+
+} // end anonymous namespace
+
+static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+ Instruction *Cond = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cond = dyn_cast<Instruction>(SI->getCondition());
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
+ Cond = dyn_cast<Instruction>(IBI->getAddress());
+ }
+
+ TI->eraseFromParent();
+ if (Cond)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond);
+}
+
+/// Return the value being compared if the specified terminator checks whether
+/// a value is equal to a constant integer value; otherwise return null.
+Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
+ Value *CV = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ // Do not permit merging of large switch instructions into their
+ // predecessors unless there is only one predecessor.
+ if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()),
+ pred_end(SI->getParent())) <=
+ 128)
+ CV = SI->getCondition();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional() && BI->getCondition()->hasOneUse())
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+ if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
+ CV = ICI->getOperand(0);
+ }
+
+ // Unwrap any lossless ptrtoint cast.
+ if (CV) {
+ if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+ Value *Ptr = PTII->getPointerOperand();
+ if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
+ CV = Ptr;
+ }
+ }
+ return CV;
+}
+
+/// Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
+ TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cases.reserve(SI->getNumCases());
+ for (auto Case : SI->cases())
+ Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+ return SI->getDefaultDest();
+ }
+
+ BranchInst *BI = cast<BranchInst>(TI);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
+ Cases.push_back(ValueEqualityComparisonCase(
+ GetConstantInt(ICI->getOperand(1), DL), Succ));
+ return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+/// Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void
+EliminateBlockCases(BasicBlock *BB,
+ std::vector<ValueEqualityComparisonCase> &Cases) {
+ Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end());
+}
+
+/// Return true if there are any keys in C1 that exist in C2 as well.
+static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+ std::vector<ValueEqualityComparisonCase> &C2) {
+ std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
+
+ // Make V1 be smaller than V2.
+ if (V1->size() > V2->size())
+ std::swap(V1, V2);
+
+ if (V1->empty())
+ return false;
+ if (V1->size() == 1) {
+ // Just scan V2.
+ ConstantInt *TheVal = (*V1)[0].Value;
+ for (unsigned i = 0, e = V2->size(); i != e; ++i)
+ if (TheVal == (*V2)[i].Value)
+ return true;
+ }
+
+ // Otherwise, just sort both lists and compare element by element.
+ array_pod_sort(V1->begin(), V1->end());
+ array_pod_sort(V2->begin(), V2->end());
+ unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+ while (i1 != e1 && i2 != e2) {
+ if ((*V1)[i1].Value == (*V2)[i2].Value)
+ return true;
+ if ((*V1)[i1].Value < (*V2)[i2].Value)
+ ++i1;
+ else
+ ++i2;
+ }
+ return false;
+}
+
+/// If TI is known to be a terminator instruction and its block is known to
+/// only have a single predecessor block, check to see if that predecessor is
+/// also a value comparison with the same value, and if that comparison
+/// determines the outcome of this comparison. If so, simplify TI. This does a
+/// very limited form of jump threading.
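+///
+/// For illustration (hypothetical CFG): if the sole predecessor switches on %x
+/// with cases 1 and 2 going elsewhere and its default edge entering this
+/// block, then %x is known to be neither 1 nor 2 here, so a branch on
+/// "%x == 1" in this block can be folded to its false successor.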
+bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
+ TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
+ Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+ if (!PredVal)
+ return false; // Not a value comparison in predecessor.
+
+ Value *ThisVal = isValueEqualityComparison(TI);
+ assert(ThisVal && "This isn't a value comparison!!");
+ if (ThisVal != PredVal)
+ return false; // Different predicates.
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Find out information about when control will move from Pred to TI's block.
+ std::vector<ValueEqualityComparisonCase> PredCases;
+ BasicBlock *PredDef =
+ GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
+ EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
+
+ // Find information about how control leaves this block.
+ std::vector<ValueEqualityComparisonCase> ThisCases;
+ BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+ EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
+
+ // If TI's block is the default block from Pred's comparison, potentially
+ // simplify TI based on this knowledge.
+ if (PredDef == TI->getParent()) {
+ // If we are here, we know that the value is none of those cases listed in
+ // PredCases. If there are any cases in ThisCases that are in PredCases, we
+ // can simplify TI.
+ if (!ValuesOverlap(PredCases, ThisCases))
+ return false;
+
+ if (isa<BranchInst>(TI)) {
+ // Okay, one of the successors of this condbr is dead. Convert it to an
+ // unconditional branch.
+ assert(ThisCases.size() == 1 && "Branch can only have one case!");
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(ThisDef);
+ (void)NI;
+
+ // Remove PHI node entries for the dead edge.
+ ThisCases[0].Dest->removePredecessor(TI->getParent());
+
+ DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorInstAndDCECond(TI);
+ return true;
+ }
+
+ SwitchInst *SI = cast<SwitchInst>(TI);
+ // Okay, TI has cases that are statically dead, prune them away.
+ SmallPtrSet<Constant *, 16> DeadCases;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ DeadCases.insert(PredCases[i].Value);
+
+ DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI);
+
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
+ if (HasWeight)
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
+ --i;
+ if (DeadCases.count(i->getCaseValue())) {
+ if (HasWeight) {
+ std::swap(Weights[i->getCaseIndex() + 1], Weights.back());
+ Weights.pop_back();
+ }
+ i->getCaseSuccessor()->removePredecessor(TI->getParent());
+ SI->removeCase(i);
+ }
+ }
+ if (HasWeight && Weights.size() >= 2)
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext())
+ .createBranchWeights(Weights));
+
+ DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+ return true;
+ }
+
+ // Otherwise, TI's block must correspond to some matched value. Find out
+ // which value (or set of values) this is.
+ ConstantInt *TIV = nullptr;
+ BasicBlock *TIBB = TI->getParent();
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].Dest == TIBB) {
+ if (TIV)
+ return false; // Cannot handle multiple values coming to this block.
+ TIV = PredCases[i].Value;
+ }
+ assert(TIV && "No edge from pred to succ?");
+
+ // Okay, we found the one constant that our value can be if we get into TI's
+ // BB. Find out which successor will unconditionally be branched to.
+ BasicBlock *TheRealDest = nullptr;
+ for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+ if (ThisCases[i].Value == TIV) {
+ TheRealDest = ThisCases[i].Dest;
+ break;
+ }
+
+ // If not handled by any explicit cases, it is handled by the default case.
+ if (!TheRealDest)
+ TheRealDest = ThisDef;
+
+ // Remove PHI node entries for dead edges.
+ BasicBlock *CheckEdge = TheRealDest;
+ for (BasicBlock *Succ : successors(TIBB))
+ if (Succ != CheckEdge)
+ Succ->removePredecessor(TIBB);
+ else
+ CheckEdge = nullptr;
+
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(TheRealDest);
+ (void)NI;
+
+ DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorInstAndDCECond(TI);
+ return true;
+}
+
+namespace {
+
+/// This class implements a stable ordering of constant
+/// integers that does not depend on their address. This is important for
+/// applications that sort ConstantInt's to ensure uniqueness.
+struct ConstantIntOrdering {
+ bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+ return LHS->getValue().ult(RHS->getValue());
+ }
+};
+
+} // end anonymous namespace
+
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+ ConstantInt *const *P2) {
+ const ConstantInt *LHS = *P1;
+ const ConstantInt *RHS = *P2;
+ if (LHS == RHS)
+ return 0;
+ return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
+}
+
+static inline bool HasBranchWeights(const Instruction *I) {
+ MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0))
+ if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ return MDS->getString().equals("branch_weights");
+
+ return false;
+}
+
+/// Get the weights of a given TerminatorInst. The default weight is at the
+/// front of the vector. If TI is a conditional eq, we need to swap the
+/// branch-weight metadata.
+static void GetBranchWeights(TerminatorInst *TI,
+ SmallVectorImpl<uint64_t> &Weights) {
+ MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
+ assert(MD);
+ for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+
+ // If TI is a conditional eq, the default case is the false case,
+ // and the corresponding branch-weight data is at index 2. We swap the
+ // default weight to be the first entry.
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ assert(Weights.size() == 2);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Weights.front(), Weights.back());
+ }
+}
+
+/// Keep halving the weights until all can fit in uint32_t.
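+/// For example (illustrative numbers): a maximum weight of 2^40 has 23 leading
+/// zeros, so every weight is shifted right by 9 bits, bringing the maximum
+/// down to 2^31, which fits in uint32_t.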
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+ uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
+ if (Max > UINT_MAX) {
+ unsigned Offset = 32 - countLeadingZeros(Max);
+ for (uint64_t &I : Weights)
+ I >>= Offset;
+ }
+}
+
+/// The specified terminator is a value equality comparison instruction
+/// (either a switch or a branch on "X == c").
+/// See if any of the predecessors of the terminator block are value comparisons
+/// on the same value. If so, and if safe to do so, fold them together.
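+///
+/// For illustration (hypothetical CFG): if a predecessor ends in a switch on
+/// %x whose case 4 goes to %other and whose default enters this block, and
+/// this block ends in a switch on %x with case 7 -> %t and default -> %d, the
+/// two can be folded into a single switch in the predecessor with case 4 ->
+/// %other, case 7 -> %t and default -> %d, updating PHI nodes as needed.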
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+ IRBuilder<> &Builder) {
+ BasicBlock *BB = TI->getParent();
+ Value *CV = isValueEqualityComparison(TI); // CondVal
+ assert(CV && "Not a comparison?");
+ bool Changed = false;
+
+ SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+ while (!Preds.empty()) {
+ BasicBlock *Pred = Preds.pop_back_val();
+
+ // See if the predecessor is a comparison with the same value.
+ TerminatorInst *PTI = Pred->getTerminator();
+ Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
+
+ if (PCV == CV && TI != PTI) {
+ SmallSetVector<BasicBlock*, 4> FailBlocks;
+ if (!SafeToMergeTerminators(TI, PTI, &FailBlocks)) {
+ for (auto *Succ : FailBlocks) {
+ if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split"))
+ return false;
+ }
+ }
+
+ // Figure out which 'cases' to copy from TI to PTI.
+ std::vector<ValueEqualityComparisonCase> BBCases;
+ BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
+
+ std::vector<ValueEqualityComparisonCase> PredCases;
+ BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
+
+ // Based on whether the default edge from PTI goes to BB or not, fill in
+ // PredCases and PredDefault with the new switch cases we would like to
+ // build.
+ SmallVector<BasicBlock *, 8> NewSuccessors;
+
+ // Update the branch weight metadata along the way
+ SmallVector<uint64_t, 8> Weights;
+ bool PredHasWeights = HasBranchWeights(PTI);
+ bool SuccHasWeights = HasBranchWeights(TI);
+
+ if (PredHasWeights) {
+ GetBranchWeights(PTI, Weights);
+ // branch-weight metadata is inconsistent here.
+ if (Weights.size() != 1 + PredCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (SuccHasWeights)
+ // If there are no predecessor weights but there are successor weights,
+ // populate Weights with 1, which will later be scaled to the sum of
+ // successor's weights
+ Weights.assign(1 + PredCases.size(), 1);
+
+ SmallVector<uint64_t, 8> SuccWeights;
+ if (SuccHasWeights) {
+ GetBranchWeights(TI, SuccWeights);
+ // branch-weight metadata is inconsistent here.
+ if (SuccWeights.size() != 1 + BBCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (PredHasWeights)
+ SuccWeights.assign(1 + BBCases.size(), 1);
+
+ if (PredDefault == BB) {
+ // If this is the default destination from PTI, only the edges in TI
+ // that don't occur in PTI, or that branch to BB will be activated.
+ std::set<ConstantInt *, ConstantIntOrdering> PTIHandled;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].Dest != BB)
+ PTIHandled.insert(PredCases[i].Value);
+ else {
+ // The default destination is BB, we don't need explicit targets.
+ std::swap(PredCases[i], PredCases.back());
+
+ if (PredHasWeights || SuccHasWeights) {
+ // Increase weight for the default case.
+ Weights[0] += Weights[i + 1];
+ std::swap(Weights[i + 1], Weights.back());
+ Weights.pop_back();
+ }
+
+ PredCases.pop_back();
+ --i;
+ --e;
+ }
+
+ // Reconstruct the new switch statement we will be building.
+ if (PredDefault != BBDefault) {
+ PredDefault->removePredecessor(Pred);
+ PredDefault = BBDefault;
+ NewSuccessors.push_back(BBDefault);
+ }
+
+ unsigned CasesFromPred = Weights.size();
+ uint64_t ValidTotalSuccWeight = 0;
+ for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+ if (!PTIHandled.count(BBCases[i].Value) &&
+ BBCases[i].Dest != BBDefault) {
+ PredCases.push_back(BBCases[i]);
+ NewSuccessors.push_back(BBCases[i].Dest);
+ if (SuccHasWeights || PredHasWeights) {
+ // The default weight is at index 0, so weight for the ith case
+ // should be at index i+1. Scale the cases from successor by
+ // PredDefaultWeight (Weights[0]).
+ Weights.push_back(Weights[0] * SuccWeights[i + 1]);
+ ValidTotalSuccWeight += SuccWeights[i + 1];
+ }
+ }
+
+ if (SuccHasWeights || PredHasWeights) {
+ ValidTotalSuccWeight += SuccWeights[0];
+ // Scale the cases from predecessor by ValidTotalSuccWeight.
+ for (unsigned i = 1; i < CasesFromPred; ++i)
+ Weights[i] *= ValidTotalSuccWeight;
+ // Scale the default weight by SuccDefaultWeight (SuccWeights[0]).
+ Weights[0] *= SuccWeights[0];
+ }
+ } else {
+ // If this is not the default destination from PTI, only the edges
+ // in TI that occur in PTI with a destination of BB will be
+ // activated.
+ std::set<ConstantInt *, ConstantIntOrdering> PTIHandled;
+ std::map<ConstantInt *, uint64_t> WeightsForHandled;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].Dest == BB) {
+ PTIHandled.insert(PredCases[i].Value);
+
+ if (PredHasWeights || SuccHasWeights) {
+ WeightsForHandled[PredCases[i].Value] = Weights[i + 1];
+ std::swap(Weights[i + 1], Weights.back());
+ Weights.pop_back();
+ }
+
+ std::swap(PredCases[i], PredCases.back());
+ PredCases.pop_back();
+ --i;
+ --e;
+ }
+
+ // Okay, now we know which constants were sent to BB from the
+ // predecessor. Figure out where they will all go now.
+ for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+ if (PTIHandled.count(BBCases[i].Value)) {
+ // If this is one we are capable of getting...
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[BBCases[i].Value]);
+ PredCases.push_back(BBCases[i]);
+ NewSuccessors.push_back(BBCases[i].Dest);
+ PTIHandled.erase(
+ BBCases[i].Value); // This constant is taken care of
+ }
+
+ // If there are any constants vectored to BB that TI doesn't handle,
+ // they must go to the default destination of TI.
+ for (ConstantInt *I : PTIHandled) {
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[I]);
+ PredCases.push_back(ValueEqualityComparisonCase(I, BBDefault));
+ NewSuccessors.push_back(BBDefault);
+ }
+ }
+
+ // Okay, at this point, we know which new successor Pred will get. Make
+ // sure we update the number of entries in the PHI nodes for these
+ // successors.
+ for (BasicBlock *NewSuccessor : NewSuccessors)
+ AddPredecessorToBlock(NewSuccessor, Pred, BB);
+
+ Builder.SetInsertPoint(PTI);
+ // Convert pointer to int before we switch.
+ if (CV->getType()->isPointerTy()) {
+ CV = Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()),
+ "magicptr");
+ }
+
+ // Now that the successors are updated, create the new Switch instruction.
+ SwitchInst *NewSI =
+ Builder.CreateSwitch(CV, PredDefault, PredCases.size());
+ NewSI->setDebugLoc(PTI->getDebugLoc());
+ for (ValueEqualityComparisonCase &V : PredCases)
+ NewSI->addCase(V.Value, V.Dest);
+
+ if (PredHasWeights || SuccHasWeights) {
+ // Halve the weights if any of them cannot fit in an uint32_t
+ FitWeights(Weights);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+
+ NewSI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).createBranchWeights(MDWeights));
+ }
+
+ EraseTerminatorInstAndDCECond(PTI);
+
+ // Okay, last check. If BB is still a successor of the new switch, then we
+ // must have an infinite loop case. If so, add an infinitely looping block
+ // to handle the case to preserve the behavior of the code.
+ BasicBlock *InfLoopBlock = nullptr;
+ for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i)
+ if (NewSI->getSuccessor(i) == BB) {
+ if (!InfLoopBlock) {
+ // Insert it at the end of the function, because it's either code,
+ // or it won't matter if it's hot. :)
+ InfLoopBlock = BasicBlock::Create(BB->getContext(), "infloop",
+ BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ }
+ NewSI->setSuccessor(i, InfLoopBlock);
+ }
+
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+// If we would need to insert a select that uses the value of this invoke
+// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
+// can't hoist the invoke, as there is nowhere to put the select in this case.
+static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+ Instruction *I1, Instruction *I2) {
+ for (BasicBlock *Succ : successors(BB1)) {
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = Succ->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
+
+/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
+/// in the two blocks up into the branch block. The caller of this function
+/// guarantees that BI's block dominates BB1 and BB2.
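+///
+/// For example (illustrative): if both successors start with an identical
+/// "%a = add i32 %x, 1", the add is moved up into the branch block, the copy
+/// in the other block is RAUW'd and erased, and scanning continues with the
+/// next pair of instructions.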
+static bool HoistThenElseCodeToIf(BranchInst *BI,
+ const TargetTransformInfo &TTI) {
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. In particular, we don't want to get into
+ // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
+ // such, we currently just scan for obviously identical instructions in an
+ // identical order.
+ BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
+ BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
+
+ BasicBlock::iterator BB1_Itr = BB1->begin();
+ BasicBlock::iterator BB2_Itr = BB2->begin();
+
+ Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
+ (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
+ return false;
+
+ BasicBlock *BIParent = BI->getParent();
+
+ bool Changed = false;
+ do {
+ // If we are hoisting the terminator instruction, don't move it (that would
+ // leave a broken BB); instead clone it and remove BI.
+ if (isa<TerminatorInst>(I1))
+ goto HoistTerminator;
+
+ if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
+ return Changed;
+
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->andIRFlags(I2);
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access};
+ combineMetadata(I1, I2, KnownIDs);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ if (!isa<CallInst>(I1))
+ I1->setDebugLoc(
+ DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()));
+
+ I2->eraseFromParent();
+ Changed = true;
+
+ I1 = &*BB1_Itr++;
+ I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ } while (I1->isIdenticalToWhenDefined(I2));
+
+ return true;
+
+HoistTerminator:
+ // It may not be possible to hoist an invoke.
+ if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
+ return Changed;
+
+ for (BasicBlock *Succ : successors(BB1)) {
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = Succ->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // Check for passingValueIsAlwaysUndefined here because we would rather
+ // eliminate undefined control flow than convert it to a select.
+ if (passingValueIsAlwaysUndefined(BB1V, PN) ||
+ passingValueIsAlwaysUndefined(BB2V, PN))
+ return Changed;
+
+ if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+ return Changed;
+ if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+ return Changed;
+ }
+ }
+
+ // Okay, it is safe to hoist the terminator.
+ Instruction *NT = I1->clone();
+ BIParent->getInstList().insert(BI->getIterator(), NT);
+ if (!NT->getType()->isVoidTy()) {
+ I1->replaceAllUsesWith(NT);
+ I2->replaceAllUsesWith(NT);
+ NT->takeName(I1);
+ }
+
+ IRBuilder<NoFolder> Builder(NT);
+ // Hoisting one of the terminators from our successor is a great thing.
+ // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+ // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
+ // nodes, so we insert a select instruction to compute the final result.
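+ // For example (illustrative): if a successor PHI receives %a from BB1 and
+ // %b from BB2, both entries are rewritten to "select i1 <cond>, %a, %b"
+ // inserted before the hoisted terminator.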
+ std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
+ for (BasicBlock *Succ : successors(BB1)) {
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = Succ->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // These values do not agree. Insert a select instruction before NT
+ // that determines the right value.
+ SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+ if (!SI)
+ SI = cast<SelectInst>(
+ Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+ BB1V->getName() + "." + BB2V->getName(), BI));
+
+ // Make the PHI node use the select for all incoming values for BB1/BB2
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
+ PN->setIncomingValue(i, SI);
+ }
+ }
+
+ // Update any PHI nodes in our new successors.
+ for (BasicBlock *Succ : successors(BB1))
+ AddPredecessorToBlock(Succ, BIParent, BB1);
+
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+}
+
+// All instructions in Insts belong to different blocks that all unconditionally
+// branch to a common successor. Analyze each instruction and return true if it
+// would be possible to sink them into their successor, creating one common
+// instruction instead. For every value that would have to be provided by a
+// PHI node (because an operand varies in each input block), add it to
+// PHIOperands.
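+// For example (illustrative): two blocks ending in "store i32 %a, i32* %p" and
+// "store i32 %b, i32* %p" can be commoned into a single store whose value
+// operand comes from a new PHI of %a and %b; that requirement is what gets
+// recorded in PHIOperands.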
+static bool canSinkInstructions(
+ ArrayRef<Instruction *> Insts,
+ DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
+ // Prune out obviously bad instructions to move. Any non-store instruction
+ // must have exactly one use, and we check later that use is by a single,
+ // common PHI instruction in the successor.
+ for (auto *I : Insts) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return false;
+
+ // Conservatively return false if I is an inline-asm instruction. Sinking
+ // and merging inline-asm instructions can potentially create arguments
+ // that cannot satisfy the inline-asm constraints.
+ if (const auto *C = dyn_cast<CallInst>(I))
+ if (C->isInlineAsm())
+ return false;
+
+ // Everything must have only one use too, apart from stores which
+ // have no uses.
+ if (!isa<StoreInst>(I) && !I->hasOneUse())
+ return false;
+ }
+
+ const Instruction *I0 = Insts.front();
+ for (auto *I : Insts)
+ if (!I->isSameOperationAs(I0))
+ return false;
+
+ // All instructions in Insts are known to be the same opcode. If they aren't
+ // stores, check the only user of each is a PHI or in the same block as the
+ // instruction, because if a user is in the same block as an instruction
+ // we're contemplating sinking, it must already be determined to be sinkable.
+ if (!isa<StoreInst>(I0)) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
+ if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return (PNUse &&
+ PNUse->getParent() == Succ &&
+ PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
+ U->getParent() == I->getParent();
+ }))
+ return false;
+ }
+
+ // Because SROA can't handle speculating stores of selects, try not
+ // to sink loads or stores of allocas when we'd have to create a PHI for
+ // the address operand. Also, because it is likely that loads or stores
+ // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1));
+ }))
+ return false;
+ if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0));
+ }))
+ return false;
+
+ for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
+ if (I0->getOperand(OI)->getType()->isTokenTy())
+ // Don't touch any operand of token type.
+ return false;
+
+ auto SameAsI0 = [&I0, OI](const Instruction *I) {
+ assert(I->getNumOperands() == I0->getNumOperands());
+ return I->getOperand(OI) == I0->getOperand(OI);
+ };
+ if (!all_of(Insts, SameAsI0)) {
+ if (!canReplaceOperandWithVariable(I0, OI))
+ // We can't create a PHI from this GEP.
+ return false;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OI == OE - 1) {
+ // FIXME: if the call was *already* indirect, we should do this.
+ return false;
+ }
+ for (auto *I : Insts)
+ PHIOperands[I].push_back(I->getOperand(OI));
+ }
+ }
+ return true;
+}
+
+// Assuming canSinkInstructions has returned true for the last instructions of
+// Blocks, sink the last instruction of every block in Blocks to their common
+// successor, commoning them into one instruction.
+static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
+ auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
+
+ // canSinkInstructions returning true guarantees that every block has at
+ // least one non-terminator instruction.
+ SmallVector<Instruction*,4> Insts;
+ for (auto *BB : Blocks) {
+ Instruction *I = BB->getTerminator();
+ do {
+ I = I->getPrevNode();
+ } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
+ if (!isa<DbgInfoIntrinsic>(I))
+ Insts.push_back(I);
+ }
+
+ // The only checking we need to do now is that all users of all instructions
+ // are the same PHI node. canSinkInstructions should have checked this, but
+ // it is slightly over-aggressive - it gets confused by commutative
+ // instructions, so double-check it here.
+ Instruction *I0 = Insts.front();
+ if (!isa<StoreInst>(I0)) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return U == PNUse;
+ }))
+ return false;
+ }
+
+ // We don't need to do any more checking here; canSinkInstructions should
+ // have done it all for us.
+ SmallVector<Value*, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ // This check is different from the one in canSinkInstructions. There, we
+ // cared about the global view once simplifycfg (and instcombine) have
+ // completed - it takes into account PHIs that become trivially
+ // simplifiable. However here we need a more local view; if an operand
+ // differs we create a PHI and rely on instcombine to clean up the very
+ // small mess we may make.
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // The debug location for the "common" instruction is the merged locations of
+ // all the commoned instructions. We start with the original location of the
+ // "common" instruction and iteratively merge each location in the loop below.
+ const DILocation *Loc = I0->getDebugLoc();
+
+ // Update metadata and IR flags, and merge debug locations.
+ for (auto *I : Insts)
+ if (I != I0) {
+ Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+ if (!isa<CallInst>(I0))
+ I0->setDebugLoc(Loc);
+
+ if (!isa<StoreInst>(I0)) {
+ // canSinkInstructions checked that all instructions were used by
+ // one and only one PHI node. Find that now, RAUW it to our common
+ // instruction and nuke it.
+ assert(I0->hasOneUse());
+ auto *PN = cast<PHINode>(*I0->user_begin());
+ PN->replaceAllUsesWith(I0);
+ PN->eraseFromParent();
+ }
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ return true;
+}
+
+namespace {
+
+ // LockstepReverseIterator - Iterates through instructions
+ // in a set of blocks in reverse order, starting from each block's last
+ // non-terminator instruction.
+ // For example (assume all blocks have size n):
+ // LockstepReverseIterator I([B1, B2, B3]);
+ // *I-- = [B1[n], B2[n], B3[n]];
+ // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+ // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+ // ...
+ class LockstepReverseIterator {
+ ArrayRef<BasicBlock*> Blocks;
+ SmallVector<Instruction*,4> Insts;
+ bool Fail;
+ public:
+ LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) :
+ Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ Insts.clear();
+ for (auto *BB : Blocks) {
+ Instruction *Inst = BB->getTerminator();
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ if (!Inst) {
+ // Block wasn't big enough.
+ Fail = true;
+ return;
+ }
+ Insts.push_back(Inst);
+ }
+ }
+
+ bool isValid() const {
+ return !Fail;
+ }
+
+ void operator -- () {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ // Already at beginning of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
+ }
+ }
+
+ ArrayRef<Instruction*> operator * () const {
+ return Insts;
+ }
+ };
+
+} // end anonymous namespace
+
+/// Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If it is true, sink any common code
+/// in the two predecessors to BBEnd.
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+ assert(BI1->isUnconditional());
+ BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+ // We support two situations:
+ // (1) all incoming arcs are unconditional
+ // (2) one incoming arc is conditional
+ //
+ // (2) is very common in switch defaults and
+ // else-if patterns;
+ //
+ // if (a) f(1);
+ // else if (b) f(2);
+ //
+ // produces:
+ //
+ // [if]
+ // / \
+ // [f(1)] [if]
+ // | | \
+ // | | |
+ // | [f(2)]|
+ // \ | /
+ // [ end ]
+ //
+ // [end] has two unconditional predecessor arcs and one conditional. The
+ // conditional refers to the implicit empty 'else' arc. This conditional
+ // arc can also be caused by an empty default block in a switch.
+ //
+ // In this case, we attempt to sink code from all *unconditional* arcs.
+ // If we can sink instructions from these arcs (determined during the scan
+ // phase below) we insert a common successor for all unconditional arcs and
+ // connect that to [end], to enable sinking:
+ //
+ // [if]
+ // / \
+ // [x(1)] [if]
+ // | | \
+ // | | \
+ // | [x(2)] |
+ // \ / |
+ // [sink.split] |
+ // \ /
+ // [ end ]
+ //
+ SmallVector<BasicBlock*,4> UnconditionalPreds;
+ Instruction *Cond = nullptr;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
+ UnconditionalPreds.push_back(B);
+ else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
+ Cond = T;
+ else
+ return false;
+ }
+ if (UnconditionalPreds.size() < 2)
+ return false;
+
+ bool Changed = false;
+ // We take a two-step approach to tail sinking. First we scan from the end of
+ // each block upwards in lockstep. If the n'th instruction from the end of each
+ // block can be sunk, those instructions are added to InstructionsToSink and we
+ // carry on. If we can sink an instruction but need to PHI-merge some operands
+ // (because they're not identical in each instruction) we add these to
+ // PHIOperands.
+ unsigned ScanIdx = 0;
+ SmallPtrSet<Value*,4> InstructionsToSink;
+ DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+ LockstepReverseIterator LRI(UnconditionalPreds);
+ while (LRI.isValid() &&
+ canSinkInstructions(*LRI, PHIOperands)) {
+ DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n");
+ InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+ ++ScanIdx;
+ --LRI;
+ }
+
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I])
+ if (InstructionsToSink.count(V) == 0)
+ ++NumPHIdValues;
+ DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ NumPHIInsts++;
+
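+ // For example (illustrative numbers): 3 PHI'd values across 2 predecessors
+ // round up to an estimate of 2 PHI instructions, which exceeds the limit of
+ // one below.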
+ return NumPHIInsts <= 1;
+ };
+
+ if (ScanIdx > 0 && Cond) {
+ // Check if we would actually sink anything first! This mutates the CFG and
+ // adds an extra block. The goal in doing this is to allow instructions that
+ // couldn't be sunk before to be sunk - obviously, speculatable instructions
+ // (such as trunc, add) can be sunk and predicated already. So we check that
+ // we're going to sink at least one non-speculatable instruction.
+ LRI.reset();
+ unsigned Idx = 0;
+ bool Profitable = false;
+ while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
+ }
+ if (!Profitable)
+ return false;
+
+ DEBUG(dbgs() << "SINK: Splitting edge\n");
+ // We have a conditional edge and we're going to sink some instructions.
+ // Insert a new block postdominating all blocks we're going to sink from.
+ if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds,
+ ".sink.split"))
+ // Edges couldn't be split.
+ return false;
+ Changed = true;
+ }
+
+ // Now that we've analyzed all potential sinking candidates, perform the
+ // actual sink. We iteratively sink the last non-terminator of the source
+ // blocks into their common successor unless doing so would require too
+ // many PHI instructions to be generated (currently only one PHI is allowed
+ // per sunk instruction).
+ //
+ // We can use InstructionsToSink to discount values needing PHI-merging that
+ // will actually be sunk in a later iteration. This allows us to be more
+ // aggressive in what we sink. It does allow a false positive: we may sink
+ // presuming a later value will also be sunk, but then stop halfway through
+ // and never actually sink it, which means we produce more PHIs than
+ // intended. This is unlikely in practice though.
+ for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) {
+ DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
+
+ // Because we've sunk every instruction in turn, the current instruction to
+ // sink is always at index 0.
+ LRI.reset();
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+
+ if (!sinkLastInstruction(UnconditionalPreds))
+ return Changed;
+ NumSinkCommons++;
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// \brief Determine if we can hoist a sole store instruction out of a
+/// conditional block.
+///
+/// We are looking for code like the following:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... // No other stores or function calls (we could be calling a memory
+/// ... // function).
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// store i32 %add5, i32* %arrayidx2
+/// br label EndBB
+/// EndBB:
+/// ...
+/// We are going to transform this into:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... //
+/// %cmp = icmp ult %x, %y
+/// %add.add5 = select i1 %cmp, i32 %add, i32 %add5
+/// store i32 %add.add5, i32* %arrayidx2
+/// ...
+///
+/// \return The value operand of the previous store if the store can be
+/// hoisted into the predecessor block, nullptr otherwise.
+static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
+ BasicBlock *StoreBB, BasicBlock *EndBB) {
+ StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
+ if (!StoreToHoist)
+ return nullptr;
+
+ // Volatile or atomic.
+ if (!StoreToHoist->isSimple())
+ return nullptr;
+
+ Value *StorePtr = StoreToHoist->getPointerOperand();
+
+ // Look for a store to the same pointer in BrBB.
+ unsigned MaxNumInstToLookAt = 9;
+ for (Instruction &CurI : reverse(*BrBB)) {
+ if (!MaxNumInstToLookAt)
+ break;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(CurI))
+ continue;
+ --MaxNumInstToLookAt;
+
+ // Could be calling an instruction that affects memory like free().
+ if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
+ return nullptr;
+
+ if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
+ // Found the previous store; make sure it stores to the same location.
+ if (SI->getPointerOperand() == StorePtr)
+ // Found the previous store, return its value operand.
+ return SI->getValueOperand();
+ return nullptr; // Unknown store.
+ }
+ }
+
+ return nullptr;
+}
+
+/// \brief Speculate a conditional basic block flattening the CFG.
+///
+/// Note that this is a very risky transform currently. Speculating
+/// instructions like this is most often not desirable. Instead, there is an MI
+/// pass which can do it with full awareness of the resource constraints.
+/// However, some cases are "obvious" and we should do them directly. An
+/// example of this is speculating a single, reasonably cheap instruction.
+///
+/// There is only one distinct advantage to flattening the CFG at the IR level:
+/// it makes very common but simplistic optimizations such as are common in
+/// instcombine and the DAG combiner more powerful by removing CFG edges and
+/// modeling their effects with easier to reason about SSA value graphs.
+///
+///
+/// An illustration of this transform is turning this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// %sub = sub %x, %y
+/// br label %EndBB
+/// EndBB:
+/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
+/// ...
+/// \endcode
+///
+/// Into this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// %sub = sub %x, %y
+/// %cond = select i1 %cmp, i32 0, i32 %sub
+/// ...
+/// \endcode
+///
+/// \returns true if the conditional block is removed.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI) {
+ // Be conservative for now. FP select instruction can often be expensive.
+ Value *BrCond = BI->getCondition();
+ if (isa<FCmpInst>(BrCond))
+ return false;
+
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
+
+ // If ThenBB is actually on the false edge of the conditional branch, remember
+ // to swap the select operands later.
+ bool Invert = false;
+ if (ThenBB != BI->getSuccessor(0)) {
+ assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
+ Invert = true;
+ }
+ assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
+
+ // Keep a count of how many times instructions are used within ThenBB when
+ // they are candidates for sinking into ThenBB. Specifically:
+ // - They are defined in BB, and
+ // - They have no side effects, and
+ // - All of their uses are in ThenBB.
+ SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
+
+ unsigned SpeculationCost = 0;
+ Value *SpeculatedStoreValue = nullptr;
+ StoreInst *SpeculatedStore = nullptr;
+ for (BasicBlock::iterator BBI = ThenBB->begin(),
+ BBE = std::prev(ThenBB->end());
+ BBI != BBE; ++BBI) {
+ Instruction *I = &*BBI;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Only speculatively execute a single instruction (not counting the
+ // terminator) for now.
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
+ return false;
+
+ // Don't hoist the instruction if it's unsafe or expensive.
+ if (!isSafeToSpeculativelyExecute(I) &&
+ !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
+ I, BB, ThenBB, EndBB))))
+ return false;
+ if (!SpeculatedStoreValue &&
+ ComputeSpeculationCost(I, TTI) >
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
+ return false;
+
+ // Store the store speculation candidate.
+ if (SpeculatedStoreValue)
+ SpeculatedStore = cast<StoreInst>(I);
+
+ // Do not hoist the instruction if any of its operands are defined in BB but
+ // not used in BB. The transformation would prevent the operand from
+ // being sunk into the use block.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+ Instruction *OpI = dyn_cast<Instruction>(*i);
+ if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
+ continue; // Not a candidate for sinking.
+
+ ++SinkCandidateUseCounts[OpI];
+ }
+ }
+
+ // Consider any sink candidates which are only used in ThenBB as costs for
+ // speculation. Note, while we iterate over a DenseMap here, we are summing
+ // and so iteration order isn't significant.
+ for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
+ I = SinkCandidateUseCounts.begin(),
+ E = SinkCandidateUseCounts.end();
+ I != E; ++I)
+ if (I->first->getNumUses() == I->second) {
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
+ return false;
+ }
+
+ // Check that the PHI nodes can be converted to selects.
+ bool HaveRewritablePHIs = false;
+ for (BasicBlock::iterator I = EndBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ Value *OrigV = PN->getIncomingValueForBlock(BB);
+ Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+
+ // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.
+ // Skip PHIs which are trivial.
+ if (ThenV == OrigV)
+ continue;
+
+ // Don't convert to selects if we could remove undefined behavior instead.
+ if (passingValueIsAlwaysUndefined(OrigV, PN) ||
+ passingValueIsAlwaysUndefined(ThenV, PN))
+ return false;
+
+ HaveRewritablePHIs = true;
+ ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV);
+ ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV);
+ if (!OrigCE && !ThenCE)
+ continue; // Known safe and cheap.
+
+ if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
+ (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
+ return false;
+ unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, TTI) : 0;
+ unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, TTI) : 0;
+ unsigned MaxCost =
+ 2 * PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+ if (OrigCost + ThenCost > MaxCost)
+ return false;
+
+ // Account for the cost of an unfolded ConstantExpr which could end up
+ // getting expanded into Instructions.
+ // FIXME: This doesn't account for how many operations are combined in the
+ // constant expression.
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
+ return false;
+ }
+
+ // If there are no PHIs to process, bail early. This helps ensure idempotence
+ // as well.
+ if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
+ return false;
+
+ // If we get here, we can hoist the instruction and if-convert.
+ DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+
+ // Insert a select of the value of the speculated store.
+ if (SpeculatedStoreValue) {
+ IRBuilder<NoFolder> Builder(BI);
+ Value *TrueV = SpeculatedStore->getValueOperand();
+ Value *FalseV = SpeculatedStoreValue;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *S = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
+ SpeculatedStore->setOperand(0, S);
+ SpeculatedStore->setDebugLoc(
+ DILocation::getMergedLocation(
+ BI->getDebugLoc(), SpeculatedStore->getDebugLoc()));
+ }
+
+ // Metadata can be dependent on the condition we are hoisting above.
+ // Conservatively strip all metadata on the instruction.
+ for (auto &I : *ThenBB)
+ I.dropUnknownNonDebugMetadata();
+
+ // Hoist the instructions.
+ BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
+ ThenBB->begin(), std::prev(ThenBB->end()));
+
+ // Insert selects and rewrite the PHI operands.
+ IRBuilder<NoFolder> Builder(BI);
+ for (BasicBlock::iterator I = EndBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ unsigned OrigI = PN->getBasicBlockIndex(BB);
+ unsigned ThenI = PN->getBasicBlockIndex(ThenBB);
+ Value *OrigV = PN->getIncomingValue(OrigI);
+ Value *ThenV = PN->getIncomingValue(ThenI);
+
+ // Skip PHIs which are trivial.
+ if (OrigV == ThenV)
+ continue;
+
+ // Create a select whose true value is the speculatively executed value and
+ // false value is the preexisting value. Swap them if the branch
+ // destinations were inverted.
+ Value *TrueV = ThenV, *FalseV = OrigV;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *V = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
+ PN->setIncomingValue(OrigI, V);
+ PN->setIncomingValue(ThenI, V);
+ }
+
+ ++NumSpeculations;
+ return true;
+}
+
+/// Return true if we can thread a branch across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+ unsigned Size = 0;
+
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (isa<DbgInfoIntrinsic>(BBI))
+ continue;
+ if (Size > 10)
+ return false; // Don't clone large BB's.
+ ++Size;
+
+ // We can only support instructions that do not define values that are
+ // live outside of the current basic block.
+ for (User *U : BBI->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return false;
+ }
+
+ // Looks ok, continue checking.
+ }
+
+ return true;
+}
+
+/// If we have a conditional branch on a PHI node value that is defined in the
+/// same block as the branch, and if any PHI entries are constants, thread the
+/// edges corresponding to those entries directly to their ultimate
+/// destination.
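+///
+/// Illustrative sketch (names invented here): with
+/// bb:
+/// %cond = phi i1 [ true, %pred ], [ %x, %other ]
+/// br i1 %cond, label %t, label %f
+/// the edge from %pred always takes the true side, so %pred is retargeted to
+/// a small edge block that branches straight to %t, bypassing the test in %bb.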
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
+ AssumptionCache *AC) {
+ BasicBlock *BB = BI->getParent();
+ PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+ // NOTE: we currently cannot transform this case if the PHI node is used
+ // outside of the block.
+ if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+ return false;
+
+ // Degenerate case of a single entry PHI.
+ if (PN->getNumIncomingValues() == 1) {
+ FoldSingleEntryPHINodes(PN->getParent());
+ return true;
+ }
+
+ // Now we know that this block has multiple preds and two succs.
+ if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ return false;
+
+ // Can't fold blocks that contain noduplicate or convergent calls.
+ if (any_of(*BB, [](const Instruction &I) {
+ const CallInst *CI = dyn_cast<CallInst>(&I);
+ return CI && (CI->cannotDuplicate() || CI->isConvergent());
+ }))
+ return false;
+
+ // Okay, this is a simple enough basic block. See if any phi values are
+ // constants.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
+ if (!CB || !CB->getType()->isIntegerTy(1))
+ continue;
+
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+ if (RealDest == BB)
+ continue; // Skip self loops.
+ // Skip if the predecessor's terminator is an indirect branch.
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ continue;
+
+ // The dest block might have PHI nodes, other predecessors and other
+ // difficult cases. Instead of being smart about this, just insert a new
+ // block that jumps to the destination block, effectively splitting
+ // the edge we are about to create.
+ BasicBlock *EdgeBB =
+ BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
+ RealDest->getParent(), RealDest);
+ BranchInst::Create(RealDest, EdgeBB);
+
+ // Update PHI nodes.
+ AddPredecessorToBlock(RealDest, EdgeBB, BB);
+
+ // BB may have instructions that are being threaded over. Clone these
+ // instructions into EdgeBB. We know that there will be no uses of the
+ // cloned instructions outside of EdgeBB.
+ BasicBlock::iterator InsertPt = EdgeBB->begin();
+ DenseMap<Value *, Value *> TranslateMap; // Track translated values.
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+ TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+ continue;
+ }
+ // Clone the instruction.
+ Instruction *N = BBI->clone();
+ if (BBI->hasName())
+ N->setName(BBI->getName() + ".c");
+
+ // Update operands due to translation.
+ for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
+ DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
+ if (PI != TranslateMap.end())
+ *i = PI->second;
+ }
+
+ // Check for trivial simplification.
+ if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = V;
+ if (!N->mayHaveSideEffects()) {
+ N->deleteValue(); // Instruction folded away, don't need actual inst
+ N = nullptr;
+ }
+ } else {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = N;
+ }
+ // Insert the new instruction into its new home.
+ if (N)
+ EdgeBB->getInstList().insert(InsertPt, N);
+
+ // Register the new instruction with the assumption cache if necessary.
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(N))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+
+ // Loop over all of the edges from PredBB to BB, changing them to branch
+ // to EdgeBB instead.
+ TerminatorInst *PredBBTI = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+ if (PredBBTI->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredBBTI->setSuccessor(i, EdgeBB);
+ }
+
+ // Recurse, simplifying any other constants.
+ return FoldCondBranchOnPHI(BI, DL, AC) | true;
+ }
+
+ return false;
+}
+
+/// Given a BB that starts with the specified two-entry PHI node,
+/// see if we can eliminate it.
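+///
+/// Illustrative sketch (names invented here): a diamond such as
+/// entry: br i1 %c, label %then, label %else
+/// then: br label %bb
+/// else: br label %bb
+/// bb: %p = phi i32 [ %a, %then ], [ %b, %else ]
+/// can, when the arms are cheap enough, be flattened so that the dominating
+/// block computes "select i1 %c, i32 %a, i32 %b" and branches straight to %bb
+/// with the PHI removed.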
+static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ // Ok, this is a two entry PHI node. Check to see if this is a simple "if
+ // statement", which has a very simple dominance structure. Basically, we
+ // are trying to find the condition that is being branched on, which
+ // subsequently causes this merge to happen. We really want control
+ // dependence information for this check, but simplifycfg can't keep it up
+ // to date, and this catches most of the cases we care about anyway.
+ BasicBlock *BB = PN->getParent();
+ BasicBlock *IfTrue, *IfFalse;
+ Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+ if (!IfCond ||
+ // Don't bother if the branch will be constant folded trivially.
+ isa<ConstantInt>(IfCond))
+ return false;
+
+ // Okay, we found that we can merge this two-entry phi node into a select.
+ // Doing so would require us to fold *all* two entry phi nodes in this block.
+ // At some point this becomes non-profitable (particularly if the target
+ // doesn't support cmov's). Only do this transformation if there are two or
+ // fewer PHI nodes in this block.
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+ if (NumPhis > 2)
+ return false;
+
+ // Loop over the PHI's seeing if we can promote them all to select
+ // instructions. While we are at it, keep track of the instructions
+ // that need to be moved to the dominating block.
+ SmallPtrSet<Instruction *, 4> AggressiveInsts;
+ unsigned MaxCostVal0 = PHINodeFoldingThreshold,
+ MaxCostVal1 = PHINodeFoldingThreshold;
+ MaxCostVal0 *= TargetTransformInfo::TCC_Basic;
+ MaxCostVal1 *= TargetTransformInfo::TCC_Basic;
+
+ for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
+ PHINode *PN = cast<PHINode>(II++);
+ if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
+ MaxCostVal0, TTI) ||
+ !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
+ MaxCostVal1, TTI))
+ return false;
+ }
+
+ // If we folded the first phi, PN dangles at this point. Refresh it. If
+ // we ran out of PHIs then we simplified them all.
+ PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return true;
+
+ // Don't fold i1 branches on PHIs which contain binary operators. These can
+ // often be turned into switches and other things.
+ if (PN->getType()->isIntegerTy(1) &&
+ (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
+ isa<BinaryOperator>(PN->getIncomingValue(1)) ||
+ isa<BinaryOperator>(IfCond)))
+ return false;
+
+ // If all PHI nodes are promotable, check to make sure that all instructions
+ // in the predecessor blocks can be promoted as well. If not, we won't be able
+ // to get rid of the control flow, so it's not worth promoting to select
+ // instructions.
+ BasicBlock *DomBlock = nullptr;
+ BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
+ BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
+ if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
+ IfBlock1 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock1);
+ for (BasicBlock::iterator I = IfBlock1->begin(); !isa<TerminatorInst>(I);
+ ++I)
+ if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
+ return false;
+ }
+ }
+
+ if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
+ IfBlock2 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock2);
+ for (BasicBlock::iterator I = IfBlock2->begin(); !isa<TerminatorInst>(I);
+ ++I)
+ if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
+ return false;
+ }
+ }
+
+ DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
+ << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
+
+ // If we can still promote the PHI nodes after this gauntlet of tests,
+ // do all of the PHI's now.
+ Instruction *InsertPt = DomBlock->getTerminator();
+ IRBuilder<NoFolder> Builder(InsertPt);
+
+ // Move all 'aggressive' instructions, which are defined in the
+ // conditional parts of the if's, up to the dominating block.
+ if (IfBlock1) {
+ for (auto &I : *IfBlock1)
+ I.dropUnknownNonDebugMetadata();
+ DomBlock->getInstList().splice(InsertPt->getIterator(),
+ IfBlock1->getInstList(), IfBlock1->begin(),
+ IfBlock1->getTerminator()->getIterator());
+ }
+ if (IfBlock2) {
+ for (auto &I : *IfBlock2)
+ I.dropUnknownNonDebugMetadata();
+ DomBlock->getInstList().splice(InsertPt->getIterator(),
+ IfBlock2->getInstList(), IfBlock2->begin(),
+ IfBlock2->getTerminator()->getIterator());
+ }
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ // Change the PHI node into a select instruction.
+ Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+ Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
+ PN->replaceAllUsesWith(Sel);
+ Sel->takeName(PN);
+ PN->eraseFromParent();
+ }
+
+ // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
+ // has been flattened. Change DomBlock to jump directly to our new block to
+ // avoid other simplifycfg's kicking in on the diamond.
+ TerminatorInst *OldTI = DomBlock->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+ Builder.CreateBr(BB);
+ OldTI->eraseFromParent();
+ return true;
+}
+
+/// If we found a conditional branch that goes to two returning blocks,
+/// try to merge them together into one return,
+/// introducing a select if the return values disagree.
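+///
+/// Illustrative sketch (names invented here):
+/// br i1 %c, label %t, label %f
+/// t: ret i32 %x
+/// f: ret i32 %y
+/// becomes
+/// %retval = select i1 %c, i32 %x, i32 %y
+/// ret i32 %retval
+/// (the select is skipped when the two return values agree or one is undef).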
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
+ IRBuilder<> &Builder) {
+ assert(BI->isConditional() && "Must be a conditional branch");
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+ ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+ // Check to ensure both blocks are empty except for the return (optionally
+ // preceded by PHI nodes). If there are other instructions, merging would
+ // cause extra computation on one path or the other.
+ if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+ if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ Builder.SetInsertPoint(BI);
+ // Okay, we found a branch that is going to two return nodes. If
+ // there is no return value for this function, just change the
+ // branch into a return.
+ if (FalseRet->getNumOperands() == 0) {
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+ Builder.CreateRetVoid();
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+ }
+
+ // Otherwise, figure out what the true and false return values are
+ // so we can insert a new select instruction.
+ Value *TrueValue = TrueRet->getReturnValue();
+ Value *FalseValue = FalseRet->getReturnValue();
+
+ // Unwrap any PHI nodes in the return blocks.
+ if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+ if (TVPN->getParent() == TrueSucc)
+ TrueValue = TVPN->getIncomingValueForBlock(BI->getParent());
+ if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+ if (FVPN->getParent() == FalseSucc)
+ FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
+
+ // In order for this transformation to be safe, we must be able to
+ // unconditionally execute both operands to the return. This is
+ // normally the case, but we could have a potentially-trapping
+ // constant expression that prevents this transformation from being
+ // safe.
+ if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+ if (TCV->canTrap())
+ return false;
+ if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+ if (FCV->canTrap())
+ return false;
+
+ // Okay, we collected all the mapped values and checked them for sanity, and
+ // decided to really do this transformation. First, update the CFG.
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+
+ // Insert select instructions where needed.
+ Value *BrCond = BI->getCondition();
+ if (TrueValue) {
+ // Insert a select if the results differ.
+ if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+ } else if (isa<UndefValue>(TrueValue)) {
+ TrueValue = FalseValue;
+ } else {
+ TrueValue =
+ Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
+ }
+ }
+
+ Value *RI =
+ !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
+ (void)RI;
+
+ DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "NewRet = " << *RI
+ << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
+
+ EraseTerminatorInstAndDCECond(BI);
+
+ return true;
+}
+
+/// Return true if the given instruction is available
+/// in its predecessor block. If yes, the instruction will be removed.
+static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
+ if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
+ return false;
+ for (Instruction &I : *PB) {
+ Instruction *PBI = &I;
+ // Check whether Inst and PBI generate the same value.
+ if (Inst->isIdenticalTo(PBI)) {
+ Inst->replaceAllUsesWith(PBI);
+ Inst->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return true if either PBI or BI has branch weight available, and store
+/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
+/// not have branch weight, use 1:1 as its weight.
+static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
+ uint64_t &PredTrueWeight,
+ uint64_t &PredFalseWeight,
+ uint64_t &SuccTrueWeight,
+ uint64_t &SuccFalseWeight) {
+ bool PredHasWeights =
+ PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
+ bool SuccHasWeights =
+ BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
+ if (PredHasWeights || SuccHasWeights) {
+ if (!PredHasWeights)
+ PredTrueWeight = PredFalseWeight = 1;
+ if (!SuccHasWeights)
+ SuccTrueWeight = SuccFalseWeight = 1;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/// If this basic block is simple enough, and if a predecessor branches to us
+/// and one of our successors, fold the block into the predecessor and use
+/// logical operations to pick the right destination.
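+///
+/// Illustrative sketch (names invented here): given
+/// pred: br i1 %a, label %bb, label %common
+/// bb: %b = icmp ...
+/// br i1 %b, label %other, label %common
+/// both branches go to %common when their condition is false, so %b's compare
+/// is cloned into %pred and %pred branches on the combined condition
+/// (roughly "%a & %b", created below with the name "or.cond") directly to
+/// %other or %common, bypassing %bb from that predecessor.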
+bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
+ BasicBlock *BB = BI->getParent();
+
+ Instruction *Cond = nullptr;
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ else {
+ // For an unconditional branch, check for a simple CFG pattern, where
+ // BB has a single predecessor and BB's successor is also its predecessor's
+ // successor. If such a pattern exists, check for CSE between BB and its
+ // predecessor.
+ if (BasicBlock *PB = BB->getSinglePredecessor())
+ if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator()))
+ if (PBI->isConditional() &&
+ (BI->getSuccessor(0) == PBI->getSuccessor(0) ||
+ BI->getSuccessor(0) == PBI->getSuccessor(1))) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Curr = &*I++;
+ if (isa<CmpInst>(Curr)) {
+ Cond = Curr;
+ break;
+ }
+ // Quit if we can't remove this instruction.
+ if (!checkCSEInPredecessor(Curr, PB))
+ return false;
+ }
+ }
+
+ if (!Cond)
+ return false;
+ }
+
+ if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+ Cond->getParent() != BB || !Cond->hasOneUse())
+ return false;
+
+ // Make sure the instruction after the condition is the cond branch.
+ BasicBlock::iterator CondIt = ++Cond->getIterator();
+
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(CondIt))
+ ++CondIt;
+
+ if (&*CondIt != BI)
+ return false;
+
+ // Only allow this transformation if computing the condition doesn't involve
+ // too many instructions and these involved instructions can be executed
+ // unconditionally. We denote all involved instructions except the condition
+ // as "bonus instructions", and only allow this transformation when the
+ // number of the bonus instructions does not exceed a certain threshold.
+ unsigned NumBonusInsts = 0;
+ for (auto I = BB->begin(); Cond != &*I; ++I) {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I))
+ return false;
+ // I has only one use and can be executed unconditionally.
+ Instruction *User = dyn_cast<Instruction>(I->user_back());
+ if (User == nullptr || User->getParent() != BB)
+ return false;
+ // I is used in the same BB. Since BI uses Cond and doesn't have more slots
+ // to use any other instruction, User must be an instruction between next(I)
+ // and Cond.
+ ++NumBonusInsts;
+ // Exit early once we reach the limit.
+ if (NumBonusInsts > BonusInstThreshold)
+ return false;
+ }
+
+ // Cond is known to be a compare or binary operator. Check to make sure that
+ // neither operand is a potentially-trapping constant expression.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+ if (CE->canTrap())
+ return false;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+ if (CE->canTrap())
+ return false;
+
+ // Finally, don't infinitely unroll conditional loops.
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr;
+ if (TrueDest == BB || FalseDest == BB)
+ return false;
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBlock = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+ // Check that we have two conditional branches. If there is a PHI node in
+ // the common successor, verify that the same value flows in from both
+ // blocks.
+ SmallVector<PHINode *, 4> PHIs;
+ if (!PBI || PBI->isUnconditional() ||
+ (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) ||
+ (!BI->isConditional() &&
+ !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
+ continue;
+
+ // Determine if the two branches share a common destination.
+ Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd;
+ bool InvertPredCond = false;
+
+ if (BI->isConditional()) {
+ if (PBI->getSuccessor(0) == TrueDest) {
+ Opc = Instruction::Or;
+ } else if (PBI->getSuccessor(1) == FalseDest) {
+ Opc = Instruction::And;
+ } else if (PBI->getSuccessor(0) == FalseDest) {
+ Opc = Instruction::And;
+ InvertPredCond = true;
+ } else if (PBI->getSuccessor(1) == TrueDest) {
+ Opc = Instruction::Or;
+ InvertPredCond = true;
+ } else {
+ continue;
+ }
+ } else {
+ if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest)
+ continue;
+ }
+
+ DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+ IRBuilder<> Builder(PBI);
+
+ // If we need to invert the condition in the pred block to match, do so now.
+ if (InvertPredCond) {
+ Value *NewCond = PBI->getCondition();
+
+ if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) {
+ CmpInst *CI = cast<CmpInst>(NewCond);
+ CI->setPredicate(CI->getInversePredicate());
+ } else {
+ NewCond =
+ Builder.CreateNot(NewCond, PBI->getCondition()->getName() + ".not");
+ }
+
+ PBI->setCondition(NewCond);
+ PBI->swapSuccessors();
+ }
+
+ // If we have bonus instructions, clone them into the predecessor block.
+ // Note that there may be multiple predecessor blocks, so we cannot move
+ // bonus instructions to a predecessor block.
+ ValueToValueMapTy VMap; // maps original values to cloned values
+ // We already make sure Cond is the last instruction before BI. Therefore,
+ // all instructions before Cond other than DbgInfoIntrinsic are bonus
+ // instructions.
+ for (auto BonusInst = BB->begin(); Cond != &*BonusInst; ++BonusInst) {
+ if (isa<DbgInfoIntrinsic>(BonusInst))
+ continue;
+ Instruction *NewBonusInst = BonusInst->clone();
+ RemapInstruction(NewBonusInst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ VMap[&*BonusInst] = NewBonusInst;
+
+ // If we moved a load, we can no longer claim any knowledge about
+ // its potential value. The previous information might have been valid
+ // only given the branch precondition.
+ // For an analogous reason, we must also drop all the metadata whose
+ // semantics we don't understand.
+ NewBonusInst->dropUnknownNonDebugMetadata();
+
+ PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst);
+ NewBonusInst->takeName(&*BonusInst);
+ BonusInst->setName(BonusInst->getName() + ".old");
+ }
+
+ // Clone Cond into the predecessor basic block, and or/and the
+ // two conditions together.
+ Instruction *New = Cond->clone();
+ RemapInstruction(New, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ PredBlock->getInstList().insert(PBI->getIterator(), New);
+ New->takeName(Cond);
+ Cond->setName(New->getName() + ".old");
+
+ if (BI->isConditional()) {
+ Instruction *NewCond = cast<Instruction>(
+ Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond"));
+ PBI->setCondition(NewCond);
+
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
+ SmallVector<uint64_t, 8> NewWeights;
+
+ if (PBI->getSuccessor(0) == BB) {
+ if (HasWeights) {
+ // PBI: br i1 %x, BB, FalseDest
+ // BI: br i1 %y, TrueDest, FalseDest
+ // TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
+ // FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+ // TrueWeight for PBI * FalseWeight for BI.
+ // We assume that total weights of a BranchInst can fit into 32 bits.
+ // Therefore, we will not have overflow using 64-bit arithmetic.
+ NewWeights.push_back(PredFalseWeight *
+ (SuccFalseWeight + SuccTrueWeight) +
+ PredTrueWeight * SuccFalseWeight);
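+ // For example (illustrative numbers): with PBI weights 3:1 and BI
+ // weights 4:1, the new weights are 3*4 = 12 (true) and
+ // 1*(4+1) + 3*1 = 8 (false).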
+ }
+ AddPredecessorToBlock(TrueDest, PredBlock, BB);
+ PBI->setSuccessor(0, TrueDest);
+ }
+ if (PBI->getSuccessor(1) == BB) {
+ if (HasWeights) {
+ // PBI: br i1 %x, TrueDest, BB
+ // BI: br i1 %y, TrueDest, FalseDest
+ // TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+ // FalseWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight *
+ (SuccFalseWeight + SuccTrueWeight) +
+ PredFalseWeight * SuccTrueWeight);
+ // FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+ NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
+ }
+ AddPredecessorToBlock(FalseDest, PredBlock, BB);
+ PBI->setSuccessor(1, FalseDest);
+ }
+ if (NewWeights.size() == 2) {
+ // Halve the weights if any of them cannot fit in a uint32_t.
+ FitWeights(NewWeights);
+
+ SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),
+ NewWeights.end());
+ PBI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).createBranchWeights(MDWeights));
+ } else
+ PBI->setMetadata(LLVMContext::MD_prof, nullptr);
+ } else {
+ // Update PHI nodes in the common successors.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ ConstantInt *PBI_C = cast<ConstantInt>(
+ PHIs[i]->getIncomingValueForBlock(PBI->getParent()));
+ assert(PBI_C->getType()->isIntegerTy(1));
+ Instruction *MergedCond = nullptr;
+ if (PBI->getSuccessor(0) == TrueDest) {
+ // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value)
+ // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value)
+ // is false: !PBI_Cond and BI_Value
+ Instruction *NotCond = cast<Instruction>(
+ Builder.CreateNot(PBI->getCondition(), "not.cond"));
+ MergedCond = cast<Instruction>(
+ Builder.CreateBinOp(Instruction::And, NotCond, New, "and.cond"));
+ if (PBI_C->isOne())
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::Or, PBI->getCondition(), MergedCond, "or.cond"));
+ } else {
+ // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
+ // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
+ // is false: PBI_Cond and BI_Value
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::And, PBI->getCondition(), New, "and.cond"));
+ if (PBI_C->isOne()) {
+ Instruction *NotCond = cast<Instruction>(
+ Builder.CreateNot(PBI->getCondition(), "not.cond"));
+ MergedCond = cast<Instruction>(Builder.CreateBinOp(
+ Instruction::Or, NotCond, MergedCond, "or.cond"));
+ }
+ }
+ // Update PHI Node.
+ PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()),
+ MergedCond);
+ }
+ // Change PBI from Conditional to Unconditional.
+ BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI);
+ EraseTerminatorInstAndDCECond(PBI);
+ PBI = New_PBI;
+ }
+
+ // If BI was a loop latch, it may have had associated loop metadata.
+ // We need to copy it to the new latch, that is, PBI.
+ if (MDNode *LoopMD = BI->getMetadata(LLVMContext::MD_loop))
+ PBI->setMetadata(LLVMContext::MD_loop, LoopMD);
+
+ // TODO: If BB is reachable from all paths through PredBlock, then we
+ // could replace PBI's branch probabilities with BI's.
+
+ // Copy any debug value intrinsics into the end of PredBlock.
+ for (Instruction &I : *BB)
+ if (isa<DbgInfoIntrinsic>(I))
+ I.clone()->insertBefore(PBI);
+
+ return true;
+ }
+ return false;
+}
+
+// If there is exactly one store in BB1 and BB2 combined, return it; otherwise
+// return nullptr.
+static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) {
+ StoreInst *S = nullptr;
+ for (auto *BB : {BB1, BB2}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (S)
+ // Multiple stores seen.
+ return nullptr;
+ else
+ S = SI;
+ }
+ }
+ return S;
+}
+
+static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
+ Value *AlternativeV = nullptr) {
+ // PHI is going to be a PHI node that allows the value V that is defined in
+ // BB to be referenced in BB's only successor.
+ //
+ // If AlternativeV is nullptr, the only value we care about in PHI is V. It
+ // doesn't matter to us what the other operand is (it'll never get used). We
+ // could just create a new PHI with an undef incoming value, but that could
+ // increase register pressure if EarlyCSE/InstCombine can't fold it with some
+ // other PHI. So here we directly look for some PHI in BB's successor with V
+ // as an incoming operand. If we find one, we use it, else we create a new
+ // one.
+ //
+ // If AlternativeV is not nullptr, we care about both incoming values in PHI.
+ // PHI must be exactly: phi <ty> [ %V, %BB ], [ %AlternativeV, %OtherBB ]
+ // where OtherBB is the single other predecessor of BB's only successor.
+ PHINode *PHI = nullptr;
+ BasicBlock *Succ = BB->getSingleSuccessor();
+
+ for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
+ if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
+ PHI = cast<PHINode>(I);
+ if (!AlternativeV)
+ break;
+
+ assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2);
+ auto PredI = pred_begin(Succ);
+ BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
+ if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
+ break;
+ PHI = nullptr;
+ }
+ if (PHI)
+ return PHI;
+
+ // If V is not an instruction defined in BB, just return it.
+ if (!AlternativeV &&
+ (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB))
+ return V;
+
+ PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front());
+ PHI->addIncoming(V, BB);
+ for (BasicBlock *PredBB : predecessors(Succ))
+ if (PredBB != BB)
+ PHI->addIncoming(
+ AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
+ return PHI;
+}
+
+static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
+ BasicBlock *QTB, BasicBlock *QFB,
+ BasicBlock *PostBB, Value *Address,
+ bool InvertPCond, bool InvertQCond) {
+ auto IsaBitcastOfPointerType = [](const Instruction &I) {
+ return Operator::getOpcode(&I) == Instruction::BitCast &&
+ I.getType()->isPointerTy();
+ };
+
+ // If we're not in aggressive mode, we only optimize if we have some
+ // confidence that by optimizing we'll allow P and/or Q to be if-converted.
+ auto IsWorthwhile = [&](BasicBlock *BB) {
+ if (!BB)
+ return true;
+ // Heuristic: if the block can be if-converted/phi-folded and the
+ // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
+ // thread this store.
+ unsigned N = 0;
+ for (auto &I : *BB) {
+ // Cheap instructions viable for folding.
+ if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) ||
+ isa<StoreInst>(I))
+ ++N;
+ // Free instructions.
+ else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+ IsaBitcastOfPointerType(I))
+ continue;
+ else
+ return false;
+ }
+ return N <= PHINodeFoldingThreshold;
+ };
+
+ if (!MergeCondStoresAggressively &&
+ (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) ||
+ !IsWorthwhile(QFB)))
+ return false;
+
+ // For every pointer, there must be exactly two stores, one coming from
+ // PTB or PFB, and the other from QTB or QFB. We don't support more than one
+ // store (to any address) in PTB,PFB or QTB,QFB.
+ // FIXME: We could relax this restriction with a bit more work and performance
+ // testing.
+ StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB);
+ StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB);
+ if (!PStore || !QStore)
+ return false;
+
+ // Now check the stores are compatible.
+ if (!QStore->isUnordered() || !PStore->isUnordered())
+ return false;
+
+ // Check that sinking the store won't cause program behavior changes. Sinking
+ // the store out of the Q blocks won't change any behavior as we're sinking
+ // from a block to its unconditional successor. But we're moving a store from
+ // the P blocks down through the middle block (QBI) and past both QFB and QTB.
+ // So we need to check that there are no aliasing loads or stores in
+ // QBI, QTB and QFB. We also need to check there are no conflicting memory
+ // operations between PStore and the end of its parent block.
+ //
+ // The ideal way to do this is to query AliasAnalysis, but we don't
+ // preserve AA currently so that is dangerous. Be super safe and just
+ // check there are no other memory operations at all.
+ for (auto &I : *QFB->getSinglePredecessor())
+ if (I.mayReadOrWriteMemory())
+ return false;
+ for (auto &I : *QFB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ if (QTB)
+ for (auto &I : *QTB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end();
+ I != E; ++I)
+ if (&*I != PStore && I->mayReadOrWriteMemory())
+ return false;
+
+ // OK, we're going to sink the stores to PostBB. The store has to be
+ // conditional though, so first create the predicate.
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+ Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+
+ Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
+ PStore->getParent());
+ Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(),
+ QStore->getParent(), PPHI);
+
+ IRBuilder<> QB(&*PostBB->getFirstInsertionPt());
+
+ Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
+ Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
+
+ if (InvertPCond)
+ PPred = QB.CreateNot(PPred);
+ if (InvertQCond)
+ QPred = QB.CreateNot(QPred);
+ Value *CombinedPred = QB.CreateOr(PPred, QPred);
+
+ auto *T =
+ SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(), false);
+ QB.SetInsertPoint(T);
+ StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
+ AAMDNodes AAMD;
+ PStore->getAAMetadata(AAMD, /*Merge=*/false);
+ // The merged store replaces both PStore and QStore, so merge in QStore's
+ // AA metadata as well rather than keeping only PStore's.
+ QStore->getAAMetadata(AAMD, /*Merge=*/true);
+ SI->setAAMetadata(AAMD);
+
+ QStore->eraseFromParent();
+ PStore->eraseFromParent();
+
+ return true;
+}
+
+static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
+ // The intention here is to find diamonds or triangles (see below) where each
+ // conditional block contains a store to the same address. Both of these
+ // stores are conditional, so they can't be unconditionally sunk. But it may
+ // be profitable to speculatively sink the stores into one merged store at the
+ // end, and predicate the merged store on the union of the two conditions of
+ // PBI and QBI.
+ //
+ // This can reduce the number of stores executed if both of the conditions are
+ // true, and can allow the blocks to become small enough to be if-converted.
+ // This optimization will also chain, so that ladders of test-and-set
+ // sequences can be if-converted away.
+ //
+ // We only deal with simple diamonds or triangles:
+ //
+ // PBI or PBI or a combination of the two
+ // / \ | \
+ // PTB PFB | PFB
+ // \ / | /
+ // QBI QBI
+ // / \ | \
+ // QTB QFB | QFB
+ // \ / | /
+ // PostBB PostBB
+ //
+ // We model triangles as a type of diamond with a nullptr "true" block.
+ // Triangles are canonicalized so that the fallthrough edge is represented by
+ // a true condition, as in the diagram above.
+ //
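+ // Illustrative source-level sketch (names invented here):
+ //   if (a) *p = 1;
+ //   if (b) *p = 2;
+ // Both conditional blocks store to the same address, so the two stores can
+ // be replaced by one store predicated on "a | b" at the start of PostBB
+ // (via a split block), with the stored value routed through PHIs so that it
+ // is 2 when b holds and 1 when only a holds.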
+ BasicBlock *PTB = PBI->getSuccessor(0);
+ BasicBlock *PFB = PBI->getSuccessor(1);
+ BasicBlock *QTB = QBI->getSuccessor(0);
+ BasicBlock *QFB = QBI->getSuccessor(1);
+ BasicBlock *PostBB = QFB->getSingleSuccessor();
+
+ // Make sure we have a good guess for PostBB. If QTB's only successor is
+ // QFB, then QFB is a better PostBB.
+ if (QTB->getSingleSuccessor() == QFB)
+ PostBB = QFB;
+
+ // If we couldn't find a good PostBB, stop.
+ if (!PostBB)
+ return false;
+
+ bool InvertPCond = false, InvertQCond = false;
+ // Canonicalize fallthroughs to the true branches.
+ if (PFB == QBI->getParent()) {
+ std::swap(PFB, PTB);
+ InvertPCond = true;
+ }
+ if (QFB == PostBB) {
+ std::swap(QFB, QTB);
+ InvertQCond = true;
+ }
+
+ // From this point on we can assume PTB or QTB may be fallthroughs but PFB
+ // and QFB may not. Model fallthroughs as a nullptr block.
+ if (PTB == QBI->getParent())
+ PTB = nullptr;
+ if (QTB == PostBB)
+ QTB = nullptr;
+
+ // Legality bailouts. We must have at least the non-fallthrough blocks and
+ // the post-dominating block, and each non-fallthrough must have exactly one
+ // predecessor and one successor.
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
+ return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
+ };
+ if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
+ !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
+ return false;
+ if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
+ (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
+ return false;
+ if (!PostBB->hasNUses(2) || !QBI->getParent()->hasNUses(2))
+ return false;
+
+ // OK, this is a sequence of two diamonds or triangles.
+ // Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
+ SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
+ for (auto *BB : {PTB, PFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ PStoreAddresses.insert(SI->getPointerOperand());
+ }
+ for (auto *BB : {QTB, QFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ QStoreAddresses.insert(SI->getPointerOperand());
+ }
+
+ set_intersect(PStoreAddresses, QStoreAddresses);
+ // set_intersect mutates PStoreAddresses in place. Rename it here to make it
+ // clear what it contains.
+ auto &CommonAddresses = PStoreAddresses;
+
+ bool Changed = false;
+ for (auto *Address : CommonAddresses)
+ Changed |= mergeConditionalStoreToAddress(
+ PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond);
+ return Changed;
+}
+
+/// If we have a conditional branch as a predecessor of another block,
+/// this function tries to simplify it. We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+ const DataLayout &DL) {
+ assert(PBI->isConditional() && BI->isConditional());
+ BasicBlock *BB = BI->getParent();
+
+ // If this block ends with a branch instruction, and if there is a
+ // predecessor that ends on a branch of the same condition, make
+ // this conditional branch redundant.
+ if (PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ // Okay, the outcome of this conditional branch is statically
+ // knowable. If this block had a single pred, handle specially.
+ if (BB->getSinglePredecessor()) {
+ // Turn this into a branch on constant.
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ BI->setCondition(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
+ return true; // Nuke the branch on constant.
+ }
+
+ // Otherwise, if there are multiple predecessors, insert a PHI that merges
+ // in the constant and simplify the block result. Subsequent passes of
+ // simplifycfg will thread the block.
+ if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ PHINode *NewPN = PHINode::Create(
+ Type::getInt1Ty(BB->getContext()), std::distance(PB, PE),
+ BI->getCondition()->getName() + ".pr", &BB->front());
+ // Okay, we're going to insert the PHI node. Since PBI is not the only
+ // predecessor, compute the PHI'd conditional value for all of the preds.
+ // Any predecessor where the condition is not computable is kept symbolic.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
+ PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ NewPN->addIncoming(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
+ P);
+ } else {
+ NewPN->addIncoming(BI->getCondition(), P);
+ }
+ }
+
+ BI->setCondition(NewPN);
+ return true;
+ }
+ }
+
+ if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+ if (CE->canTrap())
+ return false;
+
+ // If both branches are conditional and both contain stores to the same
+ // address, remove the stores from the conditionals and create a conditional
+ // merged store at the end.
+ if (MergeCondStores && mergeConditionalStores(PBI, BI))
+ return true;
+
+ // If this is a conditional branch in an empty block, and if any
+ // predecessors are a conditional branch to one of our destinations,
+ // fold the conditions into logical ops and one cond br.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (&*BBI != BI)
+ return false;
+
+ int PBIOp, BIOp;
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
+ PBIOp = 0;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
+ PBIOp = 0;
+ BIOp = 1;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
+ PBIOp = 1;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
+ PBIOp = 1;
+ BIOp = 1;
+ } else {
+ return false;
+ }
+
+ // Check to make sure that the other destination of this branch
+ // isn't BB itself. If so, this is an infinite loop that will
+ // keep getting unwound.
+ if (PBI->getSuccessor(PBIOp) == BB)
+ return false;
+
+ // Do not perform this transformation if it would require
+ // insertion of a large number of select instructions. For targets
+ // without predication/cmovs, this is a big pessimization.
+
+ // Also do not perform this transformation if any phi node in the common
+ // destination block can trap when reached by BB or PBB (PR17073). In that
+ // case, it would be unsafe to hoist the operation into a select instruction.
+
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
+ ++II, ++NumPhis) {
+ if (NumPhis > 2) // Disable this xform.
+ return false;
+
+ PHINode *PN = cast<PHINode>(II);
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
+ if (CE->canTrap())
+ return false;
+
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
+ if (CE->canTrap())
+ return false;
+ }
+
+ // Finally, if everything is ok, fold the branches to logical ops.
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+
+ DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
+
+ // If OtherDest *is* BB, then BB is a basic block with a single conditional
+ // branch in it, where one edge (OtherDest) goes back to itself but the other
+ // exits. We don't *know* that the program avoids the infinite loop
+ // (even though that seems likely). If we do this xform naively, we'll end up
+ // recursively unpeeling the loop. Since we know that (after the xform is
+ // done) the block *is* infinite if reached, we just make it an obviously
+ // infinite loop with no cond branch.
+ if (OtherDest == BB) {
+ // Insert it at the end of the function, because it's either code,
+ // or it won't matter if it's hot. :)
+ BasicBlock *InfLoopBlock =
+ BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ OtherDest = InfLoopBlock;
+ }
+
+ DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // BI may have other predecessors. Because of this, we leave
+ // it alone, but modify PBI.
+
+ // Make sure we get to CommonDest on True&True directions.
+ Value *PBICond = PBI->getCondition();
+ IRBuilder<NoFolder> Builder(PBI);
+ if (PBIOp)
+ PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
+
+ Value *BICond = BI->getCondition();
+ if (BIOp)
+ BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
+
+ // Merge the conditions.
+ Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
+
+ // Modify PBI to branch on the new condition to the new dests.
+ PBI->setCondition(Cond);
+ PBI->setSuccessor(0, CommonDest);
+ PBI->setSuccessor(1, OtherDest);
+
+ // Update branch weight for PBI.
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
+ if (HasWeights) {
+ PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to CommonDest should be PredCommon * SuccTotal +
+ // PredOther * SuccCommon.
+ // The weight to OtherDest should be PredOther * SuccOther.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
+ PredOther * SuccCommon,
+ PredOther * SuccOther};
+ // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(NewWeights);
+
+ PBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext())
+ .createBranchWeights(NewWeights[0], NewWeights[1]));
+ }
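+ // A worked example of the weight math above, using hypothetical profile
+ // counts: with PredTrueWeight=8, PredFalseWeight=2, SuccTrueWeight=3,
+ // SuccFalseWeight=1 and PBIOp=BIOp=0, we get PredCommon=8, PredOther=2,
+ // SuccCommon=3, SuccOther=1, so NewWeights = {8*(3+1) + 2*3, 2*1} = {38, 2}.
+ // That matches the path probabilities: CommonDest is reached with
+ // probability 8/10 + (2/10)*(3/4) = 38/40 and OtherDest with
+ // (2/10)*(1/4) = 2/40.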
+
+ // OtherDest may have phi nodes. If so, add entries from PBI's
+ // block that are identical to the entries for BI's block.
+ AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
+
+ // We know that the CommonDest already had an edge from PBI to
+ // it. If it has PHIs though, the PHIs may have different
+ // entries for BB and PBI's BB. If so, insert a select to make
+ // them agree.
+ PHINode *PN;
+ for (BasicBlock::iterator II = CommonDest->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (BIV != PBIV) {
+ // Insert a select in PBI to pick the right value.
+ SelectInst *NV = cast<SelectInst>(
+ Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
+ PN->setIncomingValue(PBBIdx, NV);
+ // Although the select has the same condition as PBI, the original branch
+ // weights for PBI do not apply to the new select because the select's
+ // 'logical' edges are incoming edges of the phi that is eliminated, not
+ // the outgoing edges of PBI.
+ if (HasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to PredCommonDest should be PredCommon * SuccTotal.
+ // The weight to PredOtherDest should be PredOther * SuccCommon.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
+ PredOther * SuccCommon};
+
+ FitWeights(NewWeights);
+
+ NV->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext())
+ .createBranchWeights(NewWeights[0], NewWeights[1]));
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // This basic block is probably dead. We know it has at least
+ // one fewer predecessor.
+ return true;
+}
+
+// Simplifies a terminator by replacing it with a branch to TrueBB if Cond is
+// true or to FalseBB if Cond is false.
+// Takes care of updating the successors and removing the old terminator.
+// Also makes sure not to introduce new successors by assuming that edges to
+// non-successor TrueBBs and FalseBBs aren't reachable.
+static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight) {
+ // Remove any superfluous successor edges from the CFG.
+ // First, figure out which successors to preserve.
+ // If TrueBB and FalseBB are equal, only try to preserve one copy of that
+ // successor.
+ BasicBlock *KeepEdge1 = TrueBB;
+ BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
+
+ // Then remove the rest.
+ for (BasicBlock *Succ : OldTerm->successors()) {
+ // Make sure only to keep exactly one copy of each edge.
+ if (Succ == KeepEdge1)
+ KeepEdge1 = nullptr;
+ else if (Succ == KeepEdge2)
+ KeepEdge2 = nullptr;
+ else
+ Succ->removePredecessor(OldTerm->getParent(),
+ /*DontDeleteUselessPHIs=*/true);
+ }
+
+ IRBuilder<> Builder(OldTerm);
+ Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
+ // Insert an appropriate new terminator.
+ if (!KeepEdge1 && !KeepEdge2) {
+ if (TrueBB == FalseBB)
+ // We were only looking for one successor, and it was present.
+ // Create an unconditional branch to it.
+ Builder.CreateBr(TrueBB);
+ else {
+ // We found both of the successors we were looking for.
+ // Create a conditional branch sharing the condition of the select.
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ if (TrueWeight != FalseWeight)
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(OldTerm->getContext())
+ .createBranchWeights(TrueWeight, FalseWeight));
+ }
+ } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
+ // Neither of the selected blocks were successors, so this
+ // terminator must be unreachable.
+ new UnreachableInst(OldTerm->getContext(), OldTerm);
+ } else {
+ // One of the selected values was a successor, but the other wasn't.
+ // Insert an unconditional branch to the one that was found;
+ // the edge to the one that wasn't must be unreachable.
+ if (!KeepEdge1)
+ // Only TrueBB was found.
+ Builder.CreateBr(TrueBB);
+ else
+ // Only FalseBB was found.
+ Builder.CreateBr(FalseBB);
+ }
+
+ EraseTerminatorInstAndDCECond(OldTerm);
+ return true;
+}
+
+// Replaces
+// (switch (select cond, X, Y)) on constant X, Y
+// with a branch - conditional if X and Y lead to distinct BBs,
+// unconditional otherwise.
+static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
+ // Check for constant integer values in the select.
+ ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
+ ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
+ if (!TrueVal || !FalseVal)
+ return false;
+
+ // Find the relevant condition and destinations.
+ Value *Condition = Select->getCondition();
+ BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+ BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
+
+ // Get weight for TrueBB and FalseBB.
+ uint32_t TrueWeight = 0, FalseWeight = 0;
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ TrueWeight =
+ (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
+ FalseWeight =
+ (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
+ }
+ }
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
+ FalseWeight);
+}
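+// An illustrative instance of SimplifySwitchOnSelect, with hypothetical IR
+// names:
+//
+//   %v = select i1 %c, i32 1, i32 2
+//   switch i32 %v, label %default [ i32 1, label %bb1
+//                                   i32 2, label %bb2 ]
+//
+// becomes "br i1 %c, label %bb1, label %bb2", carrying over the weights of the
+// two selected cases when branch weight metadata is present.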
+
+// Replaces
+// (indirectbr (select cond, blockaddress(@fn, BlockA),
+// blockaddress(@fn, BlockB)))
+// with
+// (br cond, BlockA, BlockB).
+static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
+ // Check that both operands of the select are block addresses.
+ BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
+ BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
+ if (!TBA || !FBA)
+ return false;
+
+ // Extract the actual blocks.
+ BasicBlock *TrueBB = TBA->getBasicBlock();
+ BasicBlock *FalseBB = FBA->getBasicBlock();
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
+ 0);
+}
+
+/// This is called when we find an icmp instruction
+/// (a seteq/setne with a constant) as the only instruction in a
+/// block that ends with an uncond branch. We are looking for a very specific
+/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. In
+/// this case, we merge the first two "or's of icmp" into a switch; when the
+/// default value then goes to an uncond block with a seteq in it, we get
+/// something like:
+///
+/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ]
+/// DEFAULT:
+/// %tmp = icmp eq i8 %A, 92
+/// br label %end
+/// end:
+/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
+///
+/// We prefer to split the edge to 'end' so that there is a true/false entry to
+/// the PHI, merging the third icmp into the switch.
+static bool TryToSimplifyUncondBranchWithICmpInIt(
+ ICmpInst *ICI, IRBuilder<> &Builder, const DataLayout &DL,
+ const TargetTransformInfo &TTI, unsigned BonusInstThreshold,
+ AssumptionCache *AC) {
+ BasicBlock *BB = ICI->getParent();
+
+ // If the block has any PHIs in it or the icmp has multiple uses, it is too
+ // complex.
+ if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
+ return false;
+
+ Value *V = ICI->getOperand(0);
+ ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
+
+ // The pattern we're looking for is where our only predecessor is a switch on
+ // 'V' and this block is the default case for the switch. In this case we can
+ // fold the compared value into the switch to simplify things.
+ BasicBlock *Pred = BB->getSinglePredecessor();
+ if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
+ return false;
+
+ SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
+ if (SI->getCondition() != V)
+ return false;
+
+ // If BB is reachable on a non-default case, then we simply know the value of
+ // V in this block. Substitute it and constant fold the icmp instruction
+ // away.
+ if (SI->getDefaultDest() != BB) {
+ ConstantInt *VVal = SI->findCaseDest(BB);
+ assert(VVal && "Should have a unique destination value");
+ ICI->setOperand(0, VVal);
+
+ if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ }
+ // BB is now empty, so it is likely to simplify away.
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+
+ // Ok, the block is reachable from the default dest. If the constant we're
+ // comparing exists in one of the other edges, then we can constant fold ICI
+ // and zap it.
+ if (SI->findCaseValue(Cst) != SI->case_default()) {
+ Value *V;
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ V = ConstantInt::getFalse(BB->getContext());
+ else
+ V = ConstantInt::getTrue(BB->getContext());
+
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ // BB is now empty, so it is likely to simplify away.
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+
+ // The use of the icmp has to be in the 'end' block, by the only PHI node in
+ // the block.
+ BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
+ PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back());
+ if (PHIUse == nullptr || PHIUse != &SuccBlock->front() ||
+ isa<PHINode>(++BasicBlock::iterator(PHIUse)))
+ return false;
+
+ // If the icmp is a SETEQ, then the default dest gets false and the new edge
+ // gets true in the PHI.
+ Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
+ Constant *NewCst = ConstantInt::getFalse(BB->getContext());
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(DefaultCst, NewCst);
+
+ // Replace ICI (which is used by the PHI for the default value) with true or
+ // false depending on if it is EQ or NE.
+ ICI->replaceAllUsesWith(DefaultCst);
+ ICI->eraseFromParent();
+
+ // Okay, the switch goes to this block on a default value. Add an edge from
+ // the switch to the merge point on the compared value.
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ // Split weight for default case to case for "Cst".
+ Weights[0] = (Weights[0] + 1) >> 1;
+ Weights.push_back(Weights[0]);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).createBranchWeights(MDWeights));
+ }
+ }
+ SI->addCase(Cst, NewBB);
+
+ // NewBB branches to the phi block, add the uncond branch and the phi entry.
+ Builder.SetInsertPoint(NewBB);
+ Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+ Builder.CreateBr(SuccBlock);
+ PHIUse->addIncoming(NewCst, NewBB);
+ return true;
+}
+
+/// The specified branch is a conditional branch.
+/// Check to see if it is branching on an or/and chain of icmp instructions, and
+/// fold it into a switch instruction if so.
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ const DataLayout &DL) {
+ Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ // Change br (X == 0 | X == 1), T, F into a switch instruction.
+ // If this is a bunch of seteq's or'd together, or if it's a bunch of
+ // 'setne's and'ed together, collect them.
+
+ // Try to gather values from a chain of and/or to be turned into a switch
+ ConstantComparesGatherer ConstantCompare(Cond, DL);
+ // Unpack the result
+ SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
+ Value *CompVal = ConstantCompare.CompValue;
+ unsigned UsedICmps = ConstantCompare.UsedICmps;
+ Value *ExtraCase = ConstantCompare.Extra;
+
+ // If we didn't find a value that is compared multiple times, fail.
+ if (!CompVal)
+ return false;
+
+ // Avoid turning single icmps into a switch.
+ if (UsedICmps <= 1)
+ return false;
+
+ bool TrueWhenEqual = (Cond->getOpcode() == Instruction::Or);
+
+ // There might be duplicate constants in the list, which the switch
+ // instruction can't handle; remove them now.
+ array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
+ Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+ // If Extra was used, we require at least two switch values to do the
+ // transformation. A switch with one value is just a conditional branch.
+ if (ExtraCase && Values.size() < 2)
+ return false;
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Figure out which block is which destination.
+ BasicBlock *DefaultBB = BI->getSuccessor(1);
+ BasicBlock *EdgeBB = BI->getSuccessor(0);
+ if (!TrueWhenEqual)
+ std::swap(DefaultBB, EdgeBB);
+
+ BasicBlock *BB = BI->getParent();
+
+ DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
+
+ // If there are any extra values that couldn't be folded into the switch
+ // then we evaluate them with an explicit branch first. Split the block
+ // right before the condbr to handle it.
+ if (ExtraCase) {
+ BasicBlock *NewBB =
+ BB->splitBasicBlock(BI->getIterator(), "switch.early.test");
+ // Remove the uncond branch added to the old block.
+ TerminatorInst *OldTI = BB->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+
+ if (TrueWhenEqual)
+ Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
+ else
+ Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+
+ OldTI->eraseFromParent();
+
+ // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
+ // for the edge we just added.
+ AddPredecessorToBlock(EdgeBB, BB, NewBB);
+
+ DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
+ << "\nEXTRABB = " << *BB);
+ BB = NewBB;
+ }
+
+ Builder.SetInsertPoint(BI);
+ // Convert pointer to int before we switch.
+ if (CompVal->getType()->isPointerTy()) {
+ CompVal = Builder.CreatePtrToInt(
+ CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
+ }
+
+ // Create the new switch instruction now.
+ SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
+ // Add all of the 'cases' to the switch instruction.
+ for (unsigned i = 0, e = Values.size(); i != e; ++i)
+ New->addCase(Values[i], EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(BB);
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+ PN->addIncoming(InVal, BB);
+ }
+
+ // Erase the old branch instruction.
+ EraseTerminatorInstAndDCECond(BI);
+
+ DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
+ return true;
+}
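+// An illustrative instance of SimplifyBranchOnICmpChain, with hypothetical IR
+// names:
+//
+//   %a = icmp eq i32 %x, 0
+//   %b = icmp eq i32 %x, 1
+//   %o = or i1 %a, %b
+//   br i1 %o, label %T, label %F
+//
+// becomes a switch on %x with cases 0 and 1 going to %T and a default of %F;
+// any comparison against a non-constant ends up in ExtraCase and is tested
+// with an explicit branch on a split edge before the switch.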
+
+bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+ if (isa<PHINode>(RI->getValue()))
+ return SimplifyCommonResume(RI);
+ else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
+ RI->getValue() == RI->getParent()->getFirstNonPHI())
+ // The resume must unwind the exception that caused control to branch here.
+ return SimplifySingleResume(RI);
+
+ return false;
+}
+
+// Simplify resume that is shared by several landing pads (phi of landing pad).
+bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+
+ // Check that there are no other instructions except for debug intrinsics
+ // between the phi of landing pads (RI->getValue()) and resume instruction.
+ BasicBlock::iterator I = cast<Instruction>(RI->getValue())->getIterator(),
+ E = RI->getIterator();
+ while (++I != E)
+ if (!isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
+ auto *PhiLPInst = cast<PHINode>(RI->getValue());
+
+ // Check incoming blocks to see if any of them are trivial.
+ for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
+ Idx++) {
+ auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
+ auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
+
+ // If the block has other successors, we cannot delete it because
+ // it has other dependents.
+ if (IncomingBB->getUniqueSuccessor() != BB)
+ continue;
+
+ auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
+ // Not the landing pad that caused the control to branch here.
+ if (IncomingValue != LandingPad)
+ continue;
+
+ bool isTrivial = true;
+
+ I = IncomingBB->getFirstNonPHI()->getIterator();
+ E = IncomingBB->getTerminator()->getIterator();
+ while (++I != E)
+ if (!isa<DbgInfoIntrinsic>(I)) {
+ isTrivial = false;
+ break;
+ }
+
+ if (isTrivial)
+ TrivialUnwindBlocks.insert(IncomingBB);
+ }
+
+ // If no trivial unwind blocks, don't do any simplifications.
+ if (TrivialUnwindBlocks.empty())
+ return false;
+
+ // Turn all invokes that unwind here into calls.
+ for (auto *TrivialBB : TrivialUnwindBlocks) {
+ // Blocks that will be simplified should be removed from the phi node.
+ // Note there could be multiple edges to the resume block, and we need
+ // to remove them all.
+ while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1)
+ BB->removePredecessor(TrivialBB, true);
+
+ for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB);
+ PI != PE;) {
+ BasicBlock *Pred = *PI++;
+ removeUnwindEdge(Pred);
+ }
+
+ // In each SimplifyCFG run, only the current processed block can be erased.
+ // Otherwise, it will break the iteration of SimplifyCFG pass. So instead
+ // of erasing TrivialBB, we only remove the branch to the common resume
+ // block so that we can later erase the resume block since it has no
+ // predecessors.
+ TrivialBB->getTerminator()->eraseFromParent();
+ new UnreachableInst(RI->getContext(), TrivialBB);
+ }
+
+ // Delete the resume block if all its predecessors have been removed.
+ if (pred_empty(BB))
+ BB->eraseFromParent();
+
+ return !TrivialUnwindBlocks.empty();
+}
+
+// Simplify resume that is only used by a single (non-phi) landing pad.
+bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+ LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI());
+ assert(RI->getValue() == LPInst &&
+ "Resume must unwind the exception that caused control to here");
+
+ // Check that there are no other instructions except for debug intrinsics.
+ BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator();
+ while (++I != E)
+ if (!isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Turn all invokes that unwind here into calls and delete the basic block.
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ BasicBlock *Pred = *PI++;
+ removeUnwindEdge(Pred);
+ }
+
+ // The landingpad is now unreachable. Zap it.
+ BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
+ return true;
+}
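+// An illustrative instance of SimplifySingleResume, with hypothetical IR: if a
+// block contains only
+//
+//   %lp = landingpad ... cleanup
+//   resume ... %lp
+//
+// then each predecessor "invoke void @f() ... unwind label %lp" is expected to
+// be rewritten by removeUnwindEdge into "call void @f()" followed by a branch
+// to the invoke's normal destination, and the dead landing pad block is
+// erased.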
+
+static bool removeEmptyCleanup(CleanupReturnInst *RI) {
+ // If this is a trivial cleanup pad that executes no instructions, it can be
+ // eliminated. If the cleanup pad continues to the caller, any predecessor
+ // that is an EH pad will be updated to continue to the caller and any
+ // predecessor that terminates with an invoke instruction will have its invoke
+ // instruction converted to a call instruction. If the cleanup pad being
+ // simplified does not continue to the caller, each predecessor will be
+ // updated to continue to the unwind destination of the cleanup pad being
+ // simplified.
+ BasicBlock *BB = RI->getParent();
+ CleanupPadInst *CPInst = RI->getCleanupPad();
+ if (CPInst->getParent() != BB)
+ // This isn't an empty cleanup.
+ return false;
+
+ // We cannot kill the pad if it has multiple uses. This typically arises
+ // from unreachable basic blocks.
+ if (!CPInst->hasOneUse())
+ return false;
+
+ // Check that there are no other instructions except for benign intrinsics.
+ BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator();
+ while (++I != E) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ Intrinsic::ID IntrinsicID = II->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ case Intrinsic::lifetime_end:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ // If the cleanup return we are simplifying unwinds to the caller, this will
+ // set UnwindDest to nullptr.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr;
+
+ // We're about to remove BB from the control flow. Before we do, sink any
+ // PHINodes into the unwind destination. Doing this before changing the
+ // control flow avoids some potentially slow checks, since we can currently
+ // be certain that UnwindDest and BB have no common predecessors (since they
+ // are both EH pads).
+ if (UnwindDest) {
+ // First, go through the PHI nodes in UnwindDest and update any nodes that
+ // reference the block we are removing
+ for (BasicBlock::iterator I = UnwindDest->begin(),
+ IE = DestEHPad->getIterator();
+ I != IE; ++I) {
+ PHINode *DestPN = cast<PHINode>(I);
+
+ int Idx = DestPN->getBasicBlockIndex(BB);
+ // Since BB unwinds to UnwindDest, it has to be in the PHI node.
+ assert(Idx != -1);
+ // This PHI node has an incoming value that corresponds to a control
+ // path through the cleanup pad we are removing. If the incoming
+ // value is in the cleanup pad, it must be a PHINode (because we
+ // verified above that the block is otherwise empty). Otherwise, the
+ // value is either a constant or a value that dominates the cleanup
+ // pad being removed.
+ //
+ // Because BB and UnwindDest are both EH pads, all of their
+ // predecessors must unwind to these blocks, and since no instruction
+ // can have multiple unwind destinations, there will be no overlap in
+ // incoming blocks between SrcPN and DestPN.
+ Value *SrcVal = DestPN->getIncomingValue(Idx);
+ PHINode *SrcPN = dyn_cast<PHINode>(SrcVal);
+
+ // Remove the entry for the block we are deleting.
+ DestPN->removeIncomingValue(Idx, false);
+
+ if (SrcPN && SrcPN->getParent() == BB) {
+ // If the incoming value was a PHI node in the cleanup pad we are
+ // removing, we need to merge that PHI node's incoming values into
+ // DestPN.
+ for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
+ SrcIdx != SrcE; ++SrcIdx) {
+ DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
+ SrcPN->getIncomingBlock(SrcIdx));
+ }
+ } else {
+ // Otherwise, the incoming value came from above BB and
+ // so we can just reuse it. We must associate all of BB's
+ // predecessors with this value.
+ for (auto *pred : predecessors(BB)) {
+ DestPN->addIncoming(SrcVal, pred);
+ }
+ }
+ }
+
+ // Sink any remaining PHI nodes directly into UnwindDest.
+ Instruction *InsertPt = DestEHPad;
+ for (BasicBlock::iterator I = BB->begin(),
+ IE = BB->getFirstNonPHI()->getIterator();
+ I != IE;) {
+ // The iterator must be incremented here because the instructions are
+ // being moved to another block.
+ PHINode *PN = cast<PHINode>(I++);
+ if (PN->use_empty())
+ // If the PHI node has no uses, just leave it. It will be erased
+ // when we erase BB below.
+ continue;
+
+ // Otherwise, sink this PHI node into UnwindDest.
+ // Any predecessors to UnwindDest which are not already represented
+ // must be back edges which inherit the value from the path through
+ // BB. In this case, the PHI value must reference itself.
+ for (auto *pred : predecessors(UnwindDest))
+ if (pred != BB)
+ PN->addIncoming(PN, pred);
+ PN->moveBefore(InsertPt);
+ }
+ }
+
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ // The iterator must be updated here because we are removing this pred.
+ BasicBlock *PredBB = *PI++;
+ if (UnwindDest == nullptr) {
+ removeUnwindEdge(PredBB);
+ } else {
+ TerminatorInst *TI = PredBB->getTerminator();
+ TI->replaceUsesOfWith(BB, UnwindDest);
+ }
+ }
+
+ // The cleanup pad is now unreachable. Zap it.
+ BB->eraseFromParent();
+ return true;
+}
+
+// Try to merge two cleanuppads together.
+static bool mergeCleanupPad(CleanupReturnInst *RI) {
+ // Skip any cleanuprets which unwind to caller, there is nothing to merge
+ // with.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ if (!UnwindDest)
+ return false;
+
+ // This cleanupret isn't the only predecessor of this cleanuppad, it wouldn't
+ // be safe to merge without code duplication.
+ if (UnwindDest->getSinglePredecessor() != RI->getParent())
+ return false;
+
+ // Verify that our cleanuppad's unwind destination is another cleanuppad.
+ auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
+ if (!SuccessorCleanupPad)
+ return false;
+
+ CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
+ // Replace any uses of the successor cleanuppad with the predecessor pad.
+ // The only uses of the successor cleanuppad should be its cleanuprets and
+ // funclet bundle operands.
+ SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
+ // Remove the old cleanuppad.
+ SuccessorCleanupPad->eraseFromParent();
+ // Now, we simply replace the cleanupret with a branch to the unwind
+ // destination.
+ BranchInst::Create(UnwindDest, RI->getParent());
+ RI->eraseFromParent();
+
+ return true;
+}
+
+bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
+ // It is possible to transiently have an undef cleanuppad operand because we
+ // have deleted some, but not all, dead blocks.
+ // Eventually, this block will be deleted.
+ if (isa<UndefValue>(RI->getOperand(0)))
+ return false;
+
+ if (mergeCleanupPad(RI))
+ return true;
+
+ if (removeEmptyCleanup(RI))
+ return true;
+
+ return false;
+}
+
+bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
+ BasicBlock *BB = RI->getParent();
+ if (!BB->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ // Find predecessors that end with branches.
+ SmallVector<BasicBlock *, 8> UncondBranchPreds;
+ SmallVector<BranchInst *, 8> CondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+ TerminatorInst *PTI = P->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(P);
+ else
+ CondBranchPreds.push_back(BI);
+ }
+ }
+
+ // If we found some, do the transformation!
+ if (!UncondBranchPreds.empty() && DupRet) {
+ while (!UncondBranchPreds.empty()) {
+ BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+ DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
+ (void)FoldReturnIntoUncondBranch(RI, BB, Pred);
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (pred_empty(BB)) {
+ // We know there are no successors, so just nuke the block.
+ BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
+ }
+
+ return true;
+ }
+
+ // Check out all of the conditional branches going to this return
+ // instruction. If any of them just select between returns, change the
+ // branch itself into a select/return pair.
+ while (!CondBranchPreds.empty()) {
+ BranchInst *BI = CondBranchPreds.pop_back_val();
+
+ // Check to see if the non-BB successor is also a return block.
+ if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+ isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+ SimplifyCondBranchToTwoReturns(BI, Builder))
+ return true;
+ }
+ return false;
+}
+
+bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
+ BasicBlock *BB = UI->getParent();
+
+ bool Changed = false;
+
+ // If there are any instructions immediately before the unreachable that can
+ // be removed, do so.
+ while (UI->getIterator() != BB->begin()) {
+ BasicBlock::iterator BBI = UI->getIterator();
+ --BBI;
+ // Do not delete instructions that can have side effects which might cause
+ // the unreachable to not be reachable; specifically, calls and volatile
+ // operations may have this effect.
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+ break;
+
+ if (BBI->mayHaveSideEffects()) {
+ if (auto *SI = dyn_cast<StoreInst>(BBI)) {
+ if (SI->isVolatile())
+ break;
+ } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI->isVolatile())
+ break;
+ } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+ if (RMWI->isVolatile())
+ break;
+ } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+ if (CXI->isVolatile())
+ break;
+ } else if (isa<CatchPadInst>(BBI)) {
+ // A catchpad may invoke exception object constructors and such, which
+ // in some languages can be arbitrary code, so be conservative by
+ // default.
+ // For CoreCLR, it just involves a type test, so can be removed.
+ if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
+ EHPersonality::CoreCLR)
+ break;
+ } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
+ !isa<LandingPadInst>(BBI)) {
+ break;
+ }
+ // Note that deleting LandingPads here is in fact okay, although it
+ // involves a bit of subtle reasoning. If this inst is a LandingPad,
+ // all the predecessors of this block will be the unwind edges of Invokes,
+ // and we can therefore guarantee this block will be erased.
+ }
+
+ // Delete this instruction (any uses are guaranteed to be dead)
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BBI->eraseFromParent();
+ Changed = true;
+ }
+
+ // If the unreachable instruction is the first in the block, take a gander
+ // at all of the predecessors of this instruction, and simplify them.
+ if (&BB->front() != UI)
+ return Changed;
+
+ SmallVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ TerminatorInst *TI = Preds[i]->getTerminator();
+ IRBuilder<> Builder(TI);
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional()) {
+ if (BI->getSuccessor(0) == BB) {
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ }
+ } else {
+ if (BI->getSuccessor(0) == BB) {
+ Builder.CreateBr(BI->getSuccessor(1));
+ EraseTerminatorInstAndDCECond(BI);
+ } else if (BI->getSuccessor(1) == BB) {
+ Builder.CreateBr(BI->getSuccessor(0));
+ EraseTerminatorInstAndDCECond(BI);
+ Changed = true;
+ }
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ if (i->getCaseSuccessor() != BB) {
+ ++i;
+ continue;
+ }
+ BB->removePredecessor(SI->getParent());
+ i = SI->removeCase(i);
+ e = SI->case_end();
+ Changed = true;
+ }
+ } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (II->getUnwindDest() == BB) {
+ removeUnwindEdge(TI->getParent());
+ Changed = true;
+ }
+ } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CSI->getUnwindDest() == BB) {
+ removeUnwindEdge(TI->getParent());
+ Changed = true;
+ continue;
+ }
+
+ for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
+ E = CSI->handler_end();
+ I != E; ++I) {
+ if (*I == BB) {
+ CSI->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
+ if (CSI->getNumHandlers() == 0) {
+ BasicBlock *CatchSwitchBB = CSI->getParent();
+ if (CSI->hasUnwindDest()) {
+ // Redirect preds to the unwind dest
+ CatchSwitchBB->replaceAllUsesWith(CSI->getUnwindDest());
+ } else {
+ // Rewrite all preds to unwind to caller (or from invoke to call).
+ SmallVector<BasicBlock *, 8> EHPreds(predecessors(CatchSwitchBB));
+ for (BasicBlock *EHPred : EHPreds)
+ removeUnwindEdge(EHPred);
+ }
+ // The catchswitch is no longer reachable.
+ new UnreachableInst(CSI->getContext(), CSI);
+ CSI->eraseFromParent();
+ Changed = true;
+ }
+ } else if (isa<CleanupReturnInst>(TI)) {
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ // If this block is now dead, remove it.
+ if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
+ // We know there are no successors, so just nuke the block.
+ BB->eraseFromParent();
+ if (LoopHeaders)
+ LoopHeaders->erase(BB);
+ return true;
+ }
+
+ return Changed;
+}
+
+static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+ assert(Cases.size() >= 1);
+
+ array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+ for (size_t I = 1, E = Cases.size(); I != E; ++I) {
+ if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
+ return false;
+ }
+ return true;
+}
+
+/// Turn a switch with two reachable destinations into an integer range
+/// comparison and branch.
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
+ // Partition the cases into two sets with different destinations.
+ BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
+ BasicBlock *DestB = nullptr;
+ SmallVector<ConstantInt *, 16> CasesA;
+ SmallVector<ConstantInt *, 16> CasesB;
+
+ for (auto Case : SI->cases()) {
+ BasicBlock *Dest = Case.getCaseSuccessor();
+ if (!DestA)
+ DestA = Dest;
+ if (Dest == DestA) {
+ CasesA.push_back(Case.getCaseValue());
+ continue;
+ }
+ if (!DestB)
+ DestB = Dest;
+ if (Dest == DestB) {
+ CasesB.push_back(Case.getCaseValue());
+ continue;
+ }
+ return false; // More than two destinations.
+ }
+
+ assert(DestA && DestB &&
+ "Single-destination switch should have been folded.");
+ assert(DestA != DestB);
+ assert(DestB != SI->getDefaultDest());
+ assert(!CasesB.empty() && "There must be non-default cases.");
+ assert(!CasesA.empty() || HasDefault);
+
+ // Figure out if one of the sets of cases form a contiguous range.
+ SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
+ BasicBlock *ContiguousDest = nullptr;
+ BasicBlock *OtherDest = nullptr;
+ if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
+ ContiguousCases = &CasesA;
+ ContiguousDest = DestA;
+ OtherDest = DestB;
+ } else if (CasesAreContiguous(CasesB)) {
+ ContiguousCases = &CasesB;
+ ContiguousDest = DestB;
+ OtherDest = DestA;
+ } else
+ return false;
+
+ // Start building the compare and branch.
+
+ Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
+ Constant *NumCases =
+ ConstantInt::get(Offset->getType(), ContiguousCases->size());
+
+ Value *Sub = SI->getCondition();
+ if (!Offset->isNullValue())
+ Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+
+ Value *Cmp;
+ // If NumCases overflowed, then all possible values jump to the successor.
+ if (NumCases->isNullValue() && !ContiguousCases->empty())
+ Cmp = ConstantInt::getTrue(SI->getContext());
+ else
+ Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+ BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
+
+ // Update weight for the newly-created conditional branch.
+ if (HasBranchWeights(SI)) {
+ SmallVector<uint64_t, 8> Weights;
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ uint64_t TrueWeight = 0;
+ uint64_t FalseWeight = 0;
+ for (size_t I = 0, E = Weights.size(); I != E; ++I) {
+ if (SI->getSuccessor(I) == ContiguousDest)
+ TrueWeight += Weights[I];
+ else
+ FalseWeight += Weights[I];
+ }
+ while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
+ TrueWeight /= 2;
+ FalseWeight /= 2;
+ }
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getContext())
+ .createBranchWeights((uint32_t)TrueWeight,
+ (uint32_t)FalseWeight));
+ }
+ }
+
+ // Prune obsolete incoming values off the successors' PHI nodes.
+ for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = ContiguousCases->size();
+ if (ContiguousDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+ for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+ if (OtherDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+
+ // Drop the switch.
+ SI->eraseFromParent();
+
+ return true;
+}
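+// A worked instance of TurnSwitchRangeIntoICmp, with hypothetical IR names: a
+// switch on i32 %x whose cases 5, 6 and 7 all target %bb.a, with a reachable
+// default %bb.b, becomes
+//
+//   %x.off = add i32 %x, -5
+//   %switch = icmp ult i32 %x.off, 3
+//   br i1 %switch, label %bb.a, label %bb.b
+//
+// i.e. Offset is the negated low end of the contiguous range and NumCases is
+// the number of values in it.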
+
+/// Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
+static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
+ const DataLayout &DL) {
+ Value *Cond = SI->getCondition();
+ unsigned Bits = Cond->getType()->getIntegerBitWidth();
+ KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
+
+ // We can also eliminate cases by determining that their values are outside of
+ // the limited range of the condition based on how many significant (non-sign)
+ // bits are in the condition value.
+ unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
+ unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+
+ // Gather dead cases.
+ SmallVector<ConstantInt *, 8> DeadCases;
+ for (auto &Case : SI->cases()) {
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
+ (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
+ DeadCases.push_back(Case.getCaseValue());
+ DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n");
+ }
+ }
+
+ // If we can prove that the cases must cover all possible values, the
+ // default destination becomes dead and we can remove it. If we know some
+ // of the bits in the value, we can use that to more precisely compute the
+ // number of possible unique case values.
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const unsigned NumUnknownBits =
+ Bits - (Known.Zero | Known.One).countPopulation();
+ assert(NumUnknownBits <= Bits);
+ if (HasDefault && DeadCases.empty() &&
+ NumUnknownBits < 64 /* avoid overflow */ &&
+ SI->getNumCases() == (1ULL << NumUnknownBits)) {
+ DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ BasicBlock *NewDefault =
+ SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), "");
+ SI->setDefaultDest(&*NewDefault);
+ SplitBlock(&*NewDefault, &NewDefault->front());
+ auto *OldTI = NewDefault->getTerminator();
+ new UnreachableInst(SI->getContext(), OldTI);
+ EraseTerminatorInstAndDCECond(OldTI);
+ return true;
+ }
+
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeight = HasBranchWeights(SI);
+ if (HasWeight) {
+ GetBranchWeights(SI, Weights);
+ HasWeight = (Weights.size() == 1 + SI->getNumCases());
+ }
+
+ // Remove dead cases from the switch.
+ for (ConstantInt *DeadCase : DeadCases) {
+ SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+ assert(CaseI != SI->case_default() &&
+ "Case was not found. Probably mistake in DeadCases forming.");
+ if (HasWeight) {
+ std::swap(Weights[CaseI->getCaseIndex() + 1], Weights.back());
+ Weights.pop_back();
+ }
+
+ // Prune unused values from PHI nodes.
+ CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+ SI->removeCase(CaseI);
+ }
+ if (HasWeight && Weights.size() >= 2) {
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext())
+ .createBranchWeights(MDWeights));
+ }
+
+ return !DeadCases.empty();
+}
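+// Illustrations of EliminateDeadSwitchCases, with hypothetical known bits: if
+// computeKnownBits proves the low bit of the condition is zero, every odd case
+// value intersects Known.Zero and is removed as dead. Conversely, if only the
+// low three bits are unknown and the switch already has 2^3 = 8 live cases,
+// they cover every possible condition value, so the default edge is redirected
+// to a new block that ends in unreachable.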
+
+/// If BB would be eligible for simplification by
+/// TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
+/// by an unconditional branch), look at the phi node for BB in the successor
+/// block and see if the incoming value is equal to CaseValue. If so, return
+/// the phi node, and set PhiIndex to BB's index in the phi node.
+static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
+ BasicBlock *BB, int *PhiIndex) {
+ if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
+ return nullptr; // BB must be empty to be a candidate for simplification.
+ if (!BB->getSinglePredecessor())
+ return nullptr; // BB must be dominated by the switch.
+
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Branch || !Branch->isUnconditional())
+ return nullptr; // Terminator must be unconditional branch.
+
+ BasicBlock *Succ = Branch->getSuccessor(0);
+
+ BasicBlock::iterator I = Succ->begin();
+ while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+ int Idx = PHI->getBasicBlockIndex(BB);
+ assert(Idx >= 0 && "PHI has no entry for predecessor?");
+
+ Value *InValue = PHI->getIncomingValue(Idx);
+ if (InValue != CaseValue)
+ continue;
+
+ *PhiIndex = Idx;
+ return PHI;
+ }
+
+ return nullptr;
+}
+
+/// Try to forward the condition of a switch instruction to a phi node
+/// dominated by the switch, if that would mean that some of the destination
+/// blocks of the switch can be folded away.
+/// Returns true if a change is made.
+static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
+ typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap;
+ ForwardingNodesMap ForwardingNodes;
+
+ for (auto Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseDest = Case.getCaseSuccessor();
+
+ int PhiIndex;
+ PHINode *PHI =
+ FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIndex);
+ if (!PHI)
+ continue;
+
+ ForwardingNodes[PHI].push_back(PhiIndex);
+ }
+
+ bool Changed = false;
+
+ for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(),
+ E = ForwardingNodes.end();
+ I != E; ++I) {
+ PHINode *Phi = I->first;
+ SmallVectorImpl<int> &Indexes = I->second;
+
+ if (Indexes.size() < 2)
+ continue;
+
+ for (size_t I = 0, E = Indexes.size(); I != E; ++I)
+ Phi->setIncomingValue(Indexes[I], SI->getCondition());
+ Changed = true;
+ }
+
+ return Changed;
+}
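+// An illustrative instance of ForwardSwitchConditionToPHI, with hypothetical
+// IR names: given
+//
+//   switch i32 %x, label %default [ i32 1, label %bb1
+//                                   i32 2, label %bb2 ]
+//   ...
+//   %p = phi i32 [ 1, %bb1 ], [ 2, %bb2 ], ...
+//
+// where %bb1 and %bb2 are empty blocks that just branch to the phi's block,
+// both incoming constants equal their case values, so both are replaced with
+// %x itself, which lets %bb1 and %bb2 be folded away later.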
+
+/// Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
+ if (C->isThreadDependent())
+ return false;
+ if (C->isDLLImportDependent())
+ return false;
+
+ if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
+ !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
+ !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
+ return false;
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+ if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
+ return false;
+ }
+
+ if (!TTI.shouldBuildLookupTablesForConstant(C))
+ return false;
+
+ return true;
+}
+
+/// If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *
+LookupConstant(Value *V,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C;
+ return ConstantPool.lookup(V);
+}
+
+/// Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *
+ConstantFold(Instruction *I, const DataLayout &DL,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+ Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+ if (!A)
+ return nullptr;
+ if (A->isAllOnesValue())
+ return LookupConstant(Select->getTrueValue(), ConstantPool);
+ if (A->isNullValue())
+ return LookupConstant(Select->getFalseValue(), ConstantPool);
+ return nullptr;
+ }
+
+ SmallVector<Constant *, 4> COps;
+ for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+ if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+ COps.push_back(A);
+ else
+ return nullptr;
+ }
+
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+ COps[1], DL);
+ }
+
+ return ConstantFoldInstOperands(I, COps, DL);
+}
+
+/// Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destinations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
+static bool
+GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
+ BasicBlock **CommonDest,
+ SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
+ const DataLayout &DL, const TargetTransformInfo &TTI) {
+ // The block from which we enter the common destination.
+ BasicBlock *Pred = SI->getParent();
+
+ // If CaseDest is empty except for some side-effect free instructions through
+ // which we can constant-propagate the CaseVal, continue to its successor.
+ SmallDenseMap<Value *, Constant *> ConstantPool;
+ ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+ for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
+ ++I) {
+ if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+ // If the terminator is a simple branch, continue to the next block.
+ if (T->getNumSuccessors() != 1 || T->isExceptional())
+ return false;
+ Pred = CaseDest;
+ CaseDest = T->getSuccessor(0);
+ } else if (isa<DbgInfoIntrinsic>(I)) {
+ // Skip debug intrinsic.
+ continue;
+ } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) {
+ // Instruction is side-effect free and constant.
+
+ // If the instruction has uses outside this block or a phi node slot for
+ // the block, it is not safe to bypass the instruction since it would then
+ // no longer dominate all its uses.
+ for (auto &Use : I->uses()) {
+ User *User = Use.getUser();
+ if (Instruction *I = dyn_cast<Instruction>(User))
+ if (I->getParent() == CaseDest)
+ continue;
+ if (PHINode *Phi = dyn_cast<PHINode>(User))
+ if (Phi->getIncomingBlock(Use) == CaseDest)
+ continue;
+ return false;
+ }
+
+ ConstantPool.insert(std::make_pair(&*I, C));
+ } else {
+ break;
+ }
+ }
+
+ // If we did not have a CommonDest before, use the current one.
+ if (!*CommonDest)
+ *CommonDest = CaseDest;
+ // If the destination isn't the common one, abort.
+ if (CaseDest != *CommonDest)
+ return false;
+
+ // Get the values for this case from phi nodes in the destination block.
+ BasicBlock::iterator I = (*CommonDest)->begin();
+ while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+ int Idx = PHI->getBasicBlockIndex(Pred);
+ if (Idx == -1)
+ continue;
+
+ Constant *ConstVal =
+ LookupConstant(PHI->getIncomingValue(Idx), ConstantPool);
+ if (!ConstVal)
+ return false;
+
+ // Be conservative about which kinds of constants we support.
+ if (!ValidLookupTableConstant(ConstVal, TTI))
+ return false;
+
+ Res.push_back(std::make_pair(PHI, ConstVal));
+ }
+
+ return Res.size() > 0;
+}
+
+// Helper function used to add CaseVal to the list of cases that generate
+// Result.
+static void MapCaseToResult(ConstantInt *CaseVal,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
+ for (auto &I : UniqueResults) {
+ if (I.first == Result) {
+ I.second.push_back(CaseVal);
+ return;
+ }
+ }
+ UniqueResults.push_back(
+ std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
+}
+
+// Helper function that initializes a map containing
+// results for the PHI node of the common destination block for a switch
+// instruction. Returns false if multiple PHI nodes have been found or if
+// there is not a common destination block for the switch.
+static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
+ BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ for (auto &I : SI->cases()) {
+ ConstantInt *CaseVal = I.getCaseValue();
+
+ // Resulting value at phi nodes for this case value.
+ SwitchCaseResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
+ DL, TTI))
+ return false;
+
+ // Only one value per case is permitted
+ if (Results.size() > 1)
+ return false;
+ MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Check the PHI consistency.
+ if (!PHI)
+ PHI = Results[0].first;
+ else if (PHI != Results[0].first)
+ return false;
+ }
+ // Find the default result value.
+ SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
+ DL, TTI);
+ // If the default value is not found abort unless the default destination
+ // is unreachable.
+ DefaultResult =
+ DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
+ if ((!DefaultResult &&
+ !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
+ return false;
+
+ return true;
+}
+
+// Helper function that checks if it is possible to transform a switch with only
+// two cases (or two cases + default) that produces a result into a select.
+// Example:
+// switch (a) {
+// case 10: %0 = icmp eq i32 %a, 10
+// return 10; %1 = select i1 %0, i32 10, i32 4
+// case 20: ----> %2 = icmp eq i32 %a, 20
+// return 2; %3 = select i1 %2, i32 2, i32 %1
+// default:
+// return 4;
+// }
+static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
+ Constant *DefaultResult, Value *Condition,
+ IRBuilder<> &Builder) {
+ assert(ResultVector.size() == 2 &&
+ "We should have exactly two unique results at this point");
+ // If we are selecting between only two cases, transform into a simple
+ // select, or into two chained selects if the default case is also reachable.
+ if (ResultVector[0].second.size() == 1 &&
+ ResultVector[1].second.size() == 1) {
+ ConstantInt *const FirstCase = ResultVector[0].second[0];
+ ConstantInt *const SecondCase = ResultVector[1].second[0];
+
+ bool DefaultCanTrigger = DefaultResult;
+ Value *SelectValue = ResultVector[1].first;
+ if (DefaultCanTrigger) {
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
+ SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+ DefaultResult, "switch.select");
+ }
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
+ return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
+ }
+
+ return nullptr;
+}
+
+// Helper function to cleanup a switch instruction that has been converted into
+// a select, fixing up PHI nodes and basic blocks.
+static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
+ Value *SelectValue,
+ IRBuilder<> &Builder) {
+ BasicBlock *SelectBB = SI->getParent();
+ while (PHI->getBasicBlockIndex(SelectBB) >= 0)
+ PHI->removeIncomingValue(SelectBB);
+ PHI->addIncoming(SelectValue, SelectBB);
+
+ Builder.CreateBr(PHI->getParent());
+
+ // Remove the switch.
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
+ if (Succ == PHI->getParent())
+ continue;
+ Succ->removePredecessor(SelectBB);
+ }
+ SI->eraseFromParent();
+}
+
+/// If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with only two different
+/// constant values, replace the switch with a select.
+static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
+ AssumptionCache *AC, const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ Value *const Cond = SI->getCondition();
+ PHINode *PHI = nullptr;
+ BasicBlock *CommonDest = nullptr;
+ Constant *DefaultResult;
+ SwitchCaseResultVectorTy UniqueResults;
+ // Collect all the cases that will deliver the same value from the switch.
+ if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
+ DL, TTI))
+ return false;
+ // A select chooses between at most two values, so we need exactly two
+ // unique results.
+ if (UniqueResults.size() != 2)
+ return false;
+ assert(PHI != nullptr && "PHI for value select not found");
+
+ Builder.SetInsertPoint(SI);
+ Value *SelectValue =
+ ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
+ if (SelectValue) {
+ RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder);
+ return true;
+ }
+ // The switch couldn't be converted into a select.
+ return false;
+}
+
+namespace {
+
+/// This class represents a lookup table that can be used to replace a switch.
+class SwitchLookupTable {
+public:
+ /// Create a lookup table to use as a switch replacement with the contents
+ /// of Values, using DefaultValue to fill any holes in the table.
+ SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
+
+ /// Build instructions with Builder to retrieve the value at
+ /// the position given by Index in the lookup table.
+ Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+ /// Return true if a table with TableSize elements of
+ /// type ElementType would fit in a target-legal register.
+ static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
+ Type *ElementType);
+
+private:
+ // Depending on the contents of the table, it can be represented in
+ // different ways.
+ enum {
+ // For tables where each element contains the same value, we just have to
+ // store that single value and return it for each lookup.
+ SingleValueKind,
+
+ // For tables where there is a linear relationship between table index
+ // and values, we calculate the result with a simple multiplication
+ // and addition instead of a table lookup.
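+ // e.g. the table {3, 5, 7, 9} is encoded as LinearOffset = 3 and
+ // LinearMultiplier = 2, i.e. Result = 3 + 2 * Index.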
+ LinearMapKind,
+
+ // For small tables with integer elements, we can pack them into a bitmap
+ // that fits into a target-legal register. Values are retrieved by
+ // shift and mask operations.
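+ // e.g. an i8 table {1, 2, 3} is packed into the i24 constant 0x030201 and
+ // element I is recovered as trunc(bitmap >> (8 * I)).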
+ BitMapKind,
+
+ // The table is stored as an array of values. Values are retrieved by load
+ // instructions from the table.
+ ArrayKind
+ } Kind;
+
+ // For SingleValueKind, this is the single value.
+ Constant *SingleValue;
+
+ // For BitMapKind, this is the bitmap.
+ ConstantInt *BitMap;
+ IntegerType *BitMapElementTy;
+
+ // For LinearMapKind, these are the constants used to derive the value.
+ ConstantInt *LinearOffset;
+ ConstantInt *LinearMultiplier;
+
+ // For ArrayKind, this is the array.
+ GlobalVariable *Array;
+};
+
+} // end anonymous namespace
+
+SwitchLookupTable::SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName)
+ : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr),
+ LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) {
+ assert(Values.size() && "Can't build lookup table without values!");
+ assert(TableSize >= Values.size() && "Can't fit values in table!");
+
+ // If all values in the table are equal, this is that value.
+ SingleValue = Values.begin()->second;
+
+ Type *ValueType = Values.begin()->second->getType();
+
+ // Build up the table contents.
+ SmallVector<Constant *, 64> TableContents(TableSize);
+ for (size_t I = 0, E = Values.size(); I != E; ++I) {
+ ConstantInt *CaseVal = Values[I].first;
+ Constant *CaseRes = Values[I].second;
+ assert(CaseRes->getType() == ValueType);
+
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
+ TableContents[Idx] = CaseRes;
+
+ if (CaseRes != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // Fill in any holes in the table with the default result.
+ if (Values.size() < TableSize) {
+ assert(DefaultValue &&
+ "Need a default value to fill the lookup table holes.");
+ assert(DefaultValue->getType() == ValueType);
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ if (!TableContents[I])
+ TableContents[I] = DefaultValue;
+ }
+
+ if (DefaultValue != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // If each element in the table contains the same value, we only need to store
+ // that single value.
+ if (SingleValue) {
+ Kind = SingleValueKind;
+ return;
+ }
+
+ // Check if we can derive the value with a linear transformation from the
+ // table index.
+ if (isa<IntegerType>(ValueType)) {
+ bool LinearMappingPossible = true;
+ APInt PrevVal;
+ APInt DistToPrev;
+ assert(TableSize >= 2 && "Should be a SingleValue table.");
+ // Check if there is the same distance between two consecutive values.
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+ if (!ConstVal) {
+ // This is an undef. We could deal with it, but undefs in lookup tables
+ // are very rare. It's probably not worth the additional complexity.
+ LinearMappingPossible = false;
+ break;
+ }
+ const APInt &Val = ConstVal->getValue();
+ if (I != 0) {
+ APInt Dist = Val - PrevVal;
+ if (I == 1) {
+ DistToPrev = Dist;
+ } else if (Dist != DistToPrev) {
+ LinearMappingPossible = false;
+ break;
+ }
+ }
+ PrevVal = Val;
+ }
+ if (LinearMappingPossible) {
+ LinearOffset = cast<ConstantInt>(TableContents[0]);
+ LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
+ Kind = LinearMapKind;
+ ++NumLinearMaps;
+ return;
+ }
+ }
+
+ // If the type is integer and the table fits in a register, build a bitmap.
+ if (WouldFitInRegister(DL, TableSize, ValueType)) {
+ IntegerType *IT = cast<IntegerType>(ValueType);
+ APInt TableInt(TableSize * IT->getBitWidth(), 0);
+ for (uint64_t I = TableSize; I > 0; --I) {
+ TableInt <<= IT->getBitWidth();
+ // Insert values into the bitmap. Undef values are set to zero.
+ if (!isa<UndefValue>(TableContents[I - 1])) {
+ ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+ TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+ }
+ }
+ BitMap = ConstantInt::get(M.getContext(), TableInt);
+ BitMapElementTy = IT;
+ Kind = BitMapKind;
+ ++NumBitMaps;
+ return;
+ }
+
+ // Store the table in an array.
+ ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
+ Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+ Array = new GlobalVariable(M, ArrayTy, /*constant=*/true,
+ GlobalVariable::PrivateLinkage, Initializer,
+ "switch.table." + FuncName);
+ Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ Kind = ArrayKind;
+}
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+ switch (Kind) {
+ case SingleValueKind:
+ return SingleValue;
+ case LinearMapKind: {
+ // Derive the result value from the input value.
+ Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
+ false, "switch.idx.cast");
+ if (!LinearMultiplier->isOne())
+ Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
+ if (!LinearOffset->isZero())
+ Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
+ return Result;
+ }
+ case BitMapKind: {
+ // Type of the bitmap (e.g. i59).
+ IntegerType *MapTy = BitMap->getType();
+
+ // Cast Index to the same type as the bitmap.
+ // Note: The Index is <= the number of elements in the table, so
+ // truncating it to the width of the bitmask is safe.
+ Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+ // Multiply the shift amount by the element width.
+ ShiftAmt = Builder.CreateMul(
+ ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+ "switch.shiftamt");
+
+ // Shift down.
+ Value *DownShifted =
+ Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
+ // Mask off.
+ return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
+ }
+ case ArrayKind: {
+ // Make sure the table index will not overflow when treated as signed.
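+ // e.g. an i8 index into a 200-entry table is zero-extended to i9 so the
+ // inbounds GEP index cannot be interpreted as negative.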
+ IntegerType *IT = cast<IntegerType>(Index->getType());
+ uint64_t TableSize =
+ Array->getInitializer()->getType()->getArrayNumElements();
+ if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
+ Index = Builder.CreateZExt(
+ Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
+ "switch.tableidx.zext");
+
+ Value *GEPIndices[] = {Builder.getInt32(0), Index};
+ Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
+ GEPIndices, "switch.gep");
+ return Builder.CreateLoad(GEP, "switch.load");
+ }
+ }
+ llvm_unreachable("Unknown lookup table kind!");
+}
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
+ uint64_t TableSize,
+ Type *ElementType) {
+ auto *IT = dyn_cast<IntegerType>(ElementType);
+ if (!IT)
+ return false;
+ // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+ // are <= 15, we could try to narrow the type.
+
+ // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
+ if (TableSize >= UINT_MAX / IT->getBitWidth())
+ return false;
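+ // e.g. if the largest legal integer type is i64, eight i8 elements (64 bits)
+ // fit, while nine elements (72 bits) do not.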
+ return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// Determine whether a lookup table should be built for this switch, based on
+/// the number of cases, size of the table, and the types of the results.
+static bool
+ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+ const TargetTransformInfo &TTI, const DataLayout &DL,
+ const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
+ if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+ return false; // TableSize overflowed, or mul below might overflow.
+
+ bool AllTablesFitInRegister = true;
+ bool HasIllegalType = false;
+ for (const auto &I : ResultTypes) {
+ Type *Ty = I.second;
+
+ // Saturate this flag to true.
+ HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+ // Saturate this flag to false.
+ AllTablesFitInRegister =
+ AllTablesFitInRegister &&
+ SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
+
+ // If both flags saturate, we're done. NOTE: This *only* works with
+ // saturating flags, and all flags have to saturate first due to the
+ // non-deterministic behavior of iterating over a dense map.
+ if (HasIllegalType && !AllTablesFitInRegister)
+ break;
+ }
+
+ // If each table would fit in a register, we should build it anyway.
+ if (AllTablesFitInRegister)
+ return true;
+
+ // Don't build a table that doesn't fit in-register if it has illegal types.
+ if (HasIllegalType)
+ return false;
+
+ // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
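+ // e.g. a switch with 4 cases over a table of 10 entries has 40% density and
+ // passes (4 * 10 >= 10 * 4).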
+ return SI->getNumCases() * 10 >= TableSize * 4;
+}
+
+/// Try to reuse the switch table's index compare. The following pattern:
+/// \code
+/// if (idx < tablesize)
+/// r = table[idx]; // table does not contain default_value
+/// else
+/// r = default_value;
+/// if (r != default_value)
+/// ...
+/// \endcode
+/// Is optimized to:
+/// \code
+/// cond = idx < tablesize;
+/// if (cond)
+/// r = table[idx];
+/// else
+/// r = default_value;
+/// if (cond)
+/// ...
+/// \endcode
+/// Jump threading will then eliminate the second if(cond).
+static void reuseTableCompare(
+ User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
+ Constant *DefaultValue,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
+
+ ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
+ if (!CmpInst)
+ return;
+
+ // We require that the compare is in the same block as the phi so that jump
+ // threading can do its work afterwards.
+ if (CmpInst->getParent() != PhiBlock)
+ return;
+
+ Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
+ if (!CmpOp1)
+ return;
+
+ Value *RangeCmp = RangeCheckBranch->getCondition();
+ Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
+ Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
+
+ // Check if the compare with the default value is constant true or false.
+ Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ DefaultValue, CmpOp1, true);
+ if (DefaultConst != TrueConst && DefaultConst != FalseConst)
+ return;
+
+ // Check if the compare with the case values is distinct from the default
+ // compare result.
+ for (auto ValuePair : Values) {
+ Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ ValuePair.second, CmpOp1, true);
+ if (!CaseConst || CaseConst == DefaultConst)
+ return;
+ assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
+ "Expect true or false as compare result.");
+ }
+
+ // Check if the branch instruction dominates the phi node. It's a simple
+ // dominance check, but sufficient for our needs.
+ // Although this check is invariant in the calling loops, it's better to do it
+ // at this late stage. Practically we do it at most once for a switch.
+ BasicBlock *BranchBlock = RangeCheckBranch->getParent();
+ for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
+ return;
+ }
+
+ if (DefaultConst == FalseConst) {
+ // The compare yields the same result. We can replace it.
+ CmpInst->replaceAllUsesWith(RangeCmp);
+ ++NumTableCmpReuses;
+ } else {
+ // The compare yields the same result, just inverted. We can replace it.
+ Value *InvertedTableCmp = BinaryOperator::CreateXor(
+ RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
+ RangeCheckBranch);
+ CmpInst->replaceAllUsesWith(InvertedTableCmp);
+ ++NumTableCmpReuses;
+ }
+}
+
+/// If the switch is only used to initialize one or more phi nodes in a common
+/// successor block with different constant values, replace the switch with
+/// lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+ // Only build lookup table when we have a target that supports it.
+ if (!TTI.shouldBuildLookupTables())
+ return false;
+
+ // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+ // split off a dense part and build a lookup table for that.
+
+ // FIXME: This creates arrays of GEPs to constant strings, which means each
+ // GEP needs a runtime relocation in PIC code. We should just build one big
+ // string and lookup indices into that.
+
+ // Ignore switches with fewer than three cases; lookup tables will not make
+ // them any faster, so we don't analyze them.
+ if (SI->getNumCases() < 3)
+ return false;
+
+ // Figure out the corresponding result for each case value and phi node in the
+ // common destination, as well as the min and max case values.
+ assert(SI->case_begin() != SI->case_end());
+ SwitchInst::CaseIt CI = SI->case_begin();
+ ConstantInt *MinCaseVal = CI->getCaseValue();
+ ConstantInt *MaxCaseVal = CI->getCaseValue();
+
+ BasicBlock *CommonDest = nullptr;
+ typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy;
+ SmallDenseMap<PHINode *, ResultListTy> ResultLists;
+ SmallDenseMap<PHINode *, Constant *> DefaultResults;
+ SmallDenseMap<PHINode *, Type *> ResultTypes;
+ SmallVector<PHINode *, 4> PHIs;
+
+ for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+ ConstantInt *CaseVal = CI->getCaseValue();
+ if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+ MinCaseVal = CaseVal;
+ if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+ MaxCaseVal = CaseVal;
+
+ // Resulting value at phi nodes for this case value.
+ typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
+ ResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
+ Results, DL, TTI))
+ return false;
+
+ // Append the result from this case to the list for each phi.
+ for (const auto &I : Results) {
+ PHINode *PHI = I.first;
+ Constant *Value = I.second;
+ if (!ResultLists.count(PHI))
+ PHIs.push_back(PHI);
+ ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
+ }
+ }
+
+ // Keep track of the result types.
+ for (PHINode *PHI : PHIs) {
+ ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
+ }
+
+ uint64_t NumResults = ResultLists[PHIs[0]].size();
+ APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+ uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+ bool TableHasHoles = (NumResults < TableSize);
+
+ // If the table has holes, we need a constant result for the default case
+ // or a bitmask that fits in a register.
+ SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
+ bool HasDefaultResults =
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
+ DefaultResultsList, DL, TTI);
+
+ bool NeedMask = (TableHasHoles && !HasDefaultResults);
+ if (NeedMask) {
+ // The mask-based hole check adds extra cost, so as a penalty we require more cases.
+ if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
+ return false;
+ if (!DL.fitsInLegalInteger(TableSize))
+ return false;
+ }
+
+ for (const auto &I : DefaultResultsList) {
+ PHINode *PHI = I.first;
+ Constant *Result = I.second;
+ DefaultResults[PHI] = Result;
+ }
+
+ if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
+ return false;
+
+ // Create the BB that does the lookups.
+ Module &Mod = *CommonDest->getParent()->getParent();
+ BasicBlock *LookupBB = BasicBlock::Create(
+ Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
+
+ // Compute the table index value.
+ Builder.SetInsertPoint(SI);
+ Value *TableIndex =
+ Builder.CreateSub(SI->getCondition(), MinCaseVal, "switch.tableidx");
+
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // If the default destination is unreachable, or if the lookup table covers
+ // all values of the conditional variable, branch directly to the lookup table
+ // BB. Otherwise, check that the condition is within the case range.
+ const bool DefaultIsReachable =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
+ BranchInst *RangeCheckBranch = nullptr;
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ Builder.CreateBr(LookupBB);
+ // Note: We call removePredecessor later since we need to be able to get the
+ // PHI value for the default case in case we're using a bit mask.
+ } else {
+ Value *Cmp = Builder.CreateICmpULT(
+ TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
+ RangeCheckBranch =
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ }
+
+ // Populate the BB that does the lookups.
+ Builder.SetInsertPoint(LookupBB);
+
+ if (NeedMask) {
+ // Before doing the lookup we perform the hole check, so the existing
+ // LookupBB is repurposed for the hole check and a new LookupBB is created
+ // for the actual lookup.
+ BasicBlock *MaskBB = LookupBB;
+ MaskBB->setName("switch.hole_check");
+ LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
+ CommonDest->getParent(), CommonDest);
+
+ // Make the mask's bitwidth at least 8 bits and a power of 2 to avoid
+ // unnecessary illegal types.
+ uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
+ APInt MaskInt(TableSizePowOf2, 0);
+ APInt One(TableSizePowOf2, 1);
+ // Build bitmask; fill in a 1 bit for every case.
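+ // e.g. if the rebased case indices are {0, 2, 3}, the resulting mask is 0b1101.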
+ const ResultListTy &ResultList = ResultLists[PHIs[0]];
+ for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
+ uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
+ .getLimitedValue();
+ MaskInt |= One << Idx;
+ }
+ ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
+
+ // Get the TableIndex'th bit of the bitmask.
+ // If this bit is 0 (meaning hole) jump to the default destination,
+ // else continue with table lookup.
+ IntegerType *MapTy = TableMask->getType();
+ Value *MaskIndex =
+ Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
+ Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
+ Value *LoBit = Builder.CreateTrunc(
+ Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
+ Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
+
+ Builder.SetInsertPoint(LookupBB);
+ AddPredecessorToBlock(SI->getDefaultDest(), MaskBB, SI->getParent());
+ }
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ // We cached the PHI nodes in PHIs; to avoid accessing deleted PHI nodes
+ // later, do not delete them here.
+ SI->getDefaultDest()->removePredecessor(SI->getParent(),
+ /*DontDeleteUselessPHIs=*/true);
+ }
+
+ bool ReturnedEarly = false;
+ for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
+ PHINode *PHI = PHIs[I];
+ const ResultListTy &ResultList = ResultLists[PHI];
+
+ // If using a bitmask, use any value to fill the lookup table holes.
+ Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
+ StringRef FuncName = SI->getParent()->getParent()->getName();
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
+ FuncName);
+
+ Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+ // If the result is used to return immediately from the function, we want to
+ // do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) &&
+ PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ break;
+ }
+
+ // Do a small peephole optimization: re-use the switch table compare if
+ // possible.
+ if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
+ BasicBlock *PhiBlock = PHI->getParent();
+ // Search for compare instructions which use the phi.
+ for (auto *User : PHI->users()) {
+ reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+ }
+ }
+
+ PHI->addIncoming(Result, LookupBB);
+ }
+
+ if (!ReturnedEarly)
+ Builder.CreateBr(CommonDest);
+
+ // Remove the switch.
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
+ if (Succ == SI->getDefaultDest())
+ continue;
+ Succ->removePredecessor(SI->getParent());
+ }
+ SI->eraseFromParent();
+
+ ++NumLookupTables;
+ if (NeedMask)
+ ++NumLookupTablesHoles;
+ return true;
+}
+
+static bool isSwitchDense(ArrayRef<int64_t> Values) {
+ // See also SelectionDAGBuilder::isDense(), which this function was based on.
+ uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
+ uint64_t Range = Diff + 1;
+ uint64_t NumCases = Values.size();
+ // 40% is the default density for building a jump table in optsize/minsize mode.
+ uint64_t MinDensity = 40;
+
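+ // e.g. 4 cases covering {0, 4, 8, 12}: the range is 13, and
+ // 4 * 100 = 400 < 13 * 40 = 520, so the switch is not considered dense.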
+ return NumCases * 100 >= Range * MinDensity;
+}
+
+// Try and transform a switch that has "holes" in it to a contiguous sequence
+// of cases.
+//
+// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
+// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
+//
+// This converts a sparse switch into a dense switch which allows better
+// lowering and could also allow transforming into a lookup table.
+static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+ // Only bother with this optimization if there are more than 3 switch cases;
+ // SDAG will only bother creating jump tables for 4 or more cases.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // This transform is agnostic to the signedness of the input or case values;
+ // we can treat the case values as either signed or unsigned. Interpreting
+ // them as signed lets us optimize common sequences that cross zero, such as
+ // {-4, 0, 4, 8}.
+ SmallVector<int64_t,4> Values;
+ for (auto &C : SI->cases())
+ Values.push_back(C.getCaseValue()->getValue().getSExtValue());
+ std::sort(Values.begin(), Values.end());
+
+ // If the switch is already dense, there's nothing useful to do here.
+ if (isSwitchDense(Values))
+ return false;
+
+ // First, transform the values such that they start at zero and ascend.
+ int64_t Base = Values[0];
+ for (auto &V : Values)
+ V -= Base;
+
+ // Now we have signed numbers that have been shifted so that, given enough
+ // precision, there are no negative values. Since the rest of the transform
+ // is bitwise only, we switch now to an unsigned representation.
+ uint64_t GCD = 0;
+ for (auto &V : Values)
+ GCD = GreatestCommonDivisor64(GCD, (uint64_t)V);
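+ // e.g. for rebased values {0, 4, 8, 12} the GCD is 4, giving Shift = 2 below.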
+
+ // This transform can be done speculatively because it is so cheap - it results
+ // in a single rotate operation being inserted. This can only happen if the
+ // factor extracted is a power of 2.
+ // FIXME: If the GCD is an odd number we can multiply by the multiplicative
+ // inverse of GCD and then perform this transform.
+ // FIXME: It's possible that optimizing a switch on powers of two might also
+ // be beneficial - flag values are often powers of two and we could use a CLZ
+ // as the key function.
+ if (GCD <= 1 || !isPowerOf2_64(GCD))
+ // No common divisor found or too expensive to compute key function.
+ return false;
+
+ unsigned Shift = Log2_64(GCD);
+ for (auto &V : Values)
+ V = (int64_t)((uint64_t)V >> Shift);
+
+ if (!isSwitchDense(Values))
+ // Transform didn't create a dense switch.
+ return false;
+
+ // The obvious transform is to shift the switch condition right and emit a
+ // check that the condition is actually cleanly divisible by the GCD, i.e.
+ // (C & ((1 << Shift) - 1)) == 0,
+ // inserting a new CFG edge to handle the case where it didn't divide cleanly.
+ //
+ // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
+ // shift and puts the shifted-off bits in the uppermost bits. If any of these
+ // are nonzero then the switch condition will be very large and will hit the
+ // default case.
+
+ auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
+ Builder.SetInsertPoint(SI);
+ auto *ShiftC = ConstantInt::get(Ty, Shift);
+ auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+ auto *LShr = Builder.CreateLShr(Sub, ShiftC);
+ auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
+ auto *Rot = Builder.CreateOr(LShr, Shl);
+ SI->replaceUsesOfWith(SI->getCondition(), Rot);
+
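+ // Rewrite each case value to match; e.g. with Base = 5 and Shift = 2, the
+ // original case value 13 becomes (13 - 5) >> 2 = 2.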
+ for (auto Case : SI->cases()) {
+ auto *Orig = Case.getCaseValue();
+ auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
+ Case.setValue(
+ cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+ }
+ return true;
+}
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
+ BasicBlock *BB = SI->getParent();
+
+ if (isValueEqualityComparison(SI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ Value *Cond = SI->getCondition();
+ if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+ if (SimplifySwitchOnSelect(SI, Select))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // If the block only contains the switch, see if we can fold the block
+ // away into any preds.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (SI == &*BBI)
+ if (FoldValueComparisonIntoPredecessors(SI, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+
+ // Try to transform the switch into an icmp and a branch.
+ if (TurnSwitchRangeIntoICmp(SI, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // Remove unreachable cases.
+ if (EliminateDeadSwitchCases(SI, AC, DL))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ if (SwitchToSelect(SI, Builder, AC, DL, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ if (ForwardSwitchConditionToPHI(SI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // The conversion from switch to lookup tables results in difficult-to-analyze
+ // code and makes pruning branches much harder. This is a problem if the
+ // switch expression itself can still be restricted as a result of inlining or
+ // CVP. Therefore, only apply this transformation during late stages of the
+ // optimization chain.
+ if (LateSimplifyCFG && SwitchToLookupTable(SI, Builder, DL, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ if (ReduceSwitchRange(SI, Builder, DL, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ return false;
+}
+
+bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+ BasicBlock *BB = IBI->getParent();
+ bool Changed = false;
+
+ // Eliminate redundant destinations.
+ SmallPtrSet<Value *, 8> Succs;
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ BasicBlock *Dest = IBI->getDestination(i);
+ if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
+ Dest->removePredecessor(BB);
+ IBI->removeDestination(i);
+ --i;
+ --e;
+ Changed = true;
+ }
+ }
+
+ if (IBI->getNumDestinations() == 0) {
+ // If the indirectbr has no successors, change it to unreachable.
+ new UnreachableInst(IBI->getContext(), IBI);
+ EraseTerminatorInstAndDCECond(IBI);
+ return true;
+ }
+
+ if (IBI->getNumDestinations() == 1) {
+ // If the indirectbr has one successor, change it to a direct branch.
+ BranchInst::Create(IBI->getDestination(0), IBI);
+ EraseTerminatorInstAndDCECond(IBI);
+ return true;
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
+ if (SimplifyIndirectBrOnSelect(IBI, SI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+ return Changed;
+}
+
+/// Given a block with only a single landing pad and an unconditional branch,
+/// try to find another basic block which this one can be merged with. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't lose any ability to
+/// specialize. If anything, we make such specialization more likely.
+///
+/// TODO - This transformation could remove entries from a phi in the target
+/// block when the inputs in the phi are the same for the two blocks being
+/// merged. In some cases, this could result in removal of the PHI entirely.
+static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+ BasicBlock *BB) {
+ auto Succ = BB->getUniqueSuccessor();
+ assert(Succ);
+ // If there's a phi in the successor block, we'd likely have to introduce
+ // a phi into the merged landing pad block.
+ if (isa<PHINode>(*Succ->begin()))
+ return false;
+
+ for (BasicBlock *OtherPred : predecessors(Succ)) {
+ if (BB == OtherPred)
+ continue;
+ BasicBlock::iterator I = OtherPred->begin();
+ LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
+ if (!LPad2 || !LPad2->isIdenticalTo(LPad))
+ continue;
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {
+ }
+ BranchInst *BI2 = dyn_cast<BranchInst>(I);
+ if (!BI2 || !BI2->isIdenticalTo(BI))
+ continue;
+
+ // We've found an identical block. Update our predecessors to take that
+ // path instead and make ourselves dead.
+ SmallSet<BasicBlock *, 16> Preds;
+ Preds.insert(pred_begin(BB), pred_end(BB));
+ for (BasicBlock *Pred : Preds) {
+ InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
+ assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
+ "unexpected successor");
+ II->setUnwindDest(OtherPred);
+ }
+
+ // The debug info in OtherPred doesn't cover the merged control flow that
+ // used to go through BB. We need to delete it or update it.
+ for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
+ Instruction &Inst = *I;
+ I++;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ Inst.eraseFromParent();
+ }
+
+ SmallSet<BasicBlock *, 16> Succs;
+ Succs.insert(succ_begin(BB), succ_end(BB));
+ for (BasicBlock *Succ : Succs) {
+ Succ->removePredecessor(BB);
+ }
+
+ IRBuilder<> Builder(BI);
+ Builder.CreateUnreachable();
+ BI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
+ IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+
+ if (SinkCommon && SinkThenElseCodeToEnd(BI))
+ return true;
+
+ // If the terminator is the only non-PHI instruction, simplify the block.
+ // If LoopHeaders is provided, check whether the block is a loop header:
+ // early invocations (before loop simplify and vectorization) keep canonical
+ // loop forms for nested loops, and such blocks can be eliminated when the
+ // pass is invoked later in the back-end.
+ BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
+ if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
+ (!LoopHeaders || !LoopHeaders->count(BB)) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(BB))
+ return true;
+
+ // If the only instruction in the block is a seteq/setne comparison
+ // against a constant, try to simplify the block.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
+ if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
+ if (I->isTerminator() &&
+ TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, DL, TTI,
+ BonusInstThreshold, AC))
+ return true;
+ }
+
+ // See if we can merge an empty landing pad block with another which is
+ // equivalent.
+ if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {
+ }
+ if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB))
+ return true;
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and our successor, fold the comparison into the
+ // predecessor and use logical operations to update the incoming value
+ // for PHI nodes in common successor.
+ if (FoldBranchToCommonDest(BI, BonusInstThreshold))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ return false;
+}
+
+static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
+ BasicBlock *PredPred = nullptr;
+ for (auto *P : predecessors(BB)) {
+ BasicBlock *PPred = P->getSinglePredecessor();
+ if (!PPred || (PredPred && PredPred != PPred))
+ return nullptr;
+ PredPred = PPred;
+ }
+ return PredPred;
+}
+
+bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+
+ // Conditional branch
+ if (isValueEqualityComparison(BI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this
+ // branch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // This block must be empty, except for the setcond inst, if it exists.
+ BasicBlock::iterator I = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ if (&*I == BI) {
+ if (FoldValueComparisonIntoPredecessors(BI, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ } else if (&*I == cast<Instruction>(BI->getCondition())) {
+ ++I;
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+ }
+
+ // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
+ if (SimplifyBranchOnICmpChain(BI, Builder, DL))
+ return true;
+
+ // If this basic block has a single dominating predecessor block and the
+ // dominating block's condition implies BI's condition, we know the direction
+ // of the BI branch.
+ if (BasicBlock *Dom = BB->getSinglePredecessor()) {
+ auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
+ if (PBI && PBI->isConditional() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ assert(PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB);
+ bool CondIsFalse = PBI->getSuccessor(1) == BB;
+ Optional<bool> Implication = isImpliedCondition(
+ PBI->getCondition(), BI->getCondition(), DL, CondIsFalse);
+ if (Implication) {
+ // Turn this into a branch on constant.
+ auto *OldCond = BI->getCondition();
+ ConstantInt *CI = *Implication
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ BI->setCondition(CI);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+ }
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and one of our successors, fold the comparison into the
+ // predecessor and use logical operations to pick the right destination.
+ if (FoldBranchToCommonDest(BI, BonusInstThreshold))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // We have a conditional branch to two blocks that are only reachable
+ // from BI. We know that the condbr dominates the two blocks, so see if
+ // there is any identical code in the "then" and "else" blocks. If so, we
+ // can hoist it up to the branching block.
+ if (BI->getSuccessor(0)->getSinglePredecessor()) {
+ if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ if (HoistThenElseCodeToIf(BI, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ } else {
+ // If Successor #1 has multiple preds, we may be able to conditionally
+ // execute Successor #0 if it branches to Successor #1.
+ TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
+ if (Succ0TI->getNumSuccessors() == 1 &&
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+ } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ // If Successor #0 has multiple preds, we may be able to conditionally
+ // execute Successor #1 if it branches to Successor #0.
+ TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
+ if (Succ1TI->getNumSuccessors() == 1 &&
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+ }
+
+ // If this is a branch on a phi node in the current block, thread control
+ // through this block if any PHI node entries are constants.
+ if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+ if (PN->getParent() == BI->getParent())
+ if (FoldCondBranchOnPHI(BI, DL, AC))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // Scan predecessor blocks for conditional branches.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
+ if (SimplifyCondBranchToCondBranch(PBI, BI, DL))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ // Look for diamond patterns.
+ if (MergeCondStores)
+ if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
+ if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
+ if (mergeConditionalStores(PBI, BI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
+
+ return false;
+}
+
+/// Check if passing a value to an instruction will cause undefined behavior.
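+/// For example, a null or undef value used as the address of a non-volatile
+/// load or store in address space 0, or as the callee of a call.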
+static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+
+ if (I->use_empty())
+ return false;
+
+ if (C->isNullValue() || isa<UndefValue>(C)) {
+ // Only look at the first use to avoid hurting compile time with long use lists.
+ User *Use = *I->user_begin();
+
+ // Now make sure that there are no instructions in between that can alter
+ // control flow (e.g. calls).
+ for (BasicBlock::iterator
+ i = ++BasicBlock::iterator(I),
+ UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
+ i != UI; ++i)
+ if (i == I->getParent()->end() || i->mayHaveSideEffects())
+ return false;
+
+ // Look through GEPs. A load from a GEP derived from NULL is still undefined
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+ if (GEP->getPointerOperand() == I)
+ return passingValueIsAlwaysUndefined(V, GEP);
+
+ // Look through bitcasts.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
+ return passingValueIsAlwaysUndefined(V, BC);
+
+ // Load from null is undefined.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+ if (!LI->isVolatile())
+ return LI->getPointerAddressSpace() == 0;
+
+ // Store to null is undefined.
+ if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+ if (!SI->isVolatile())
+ return SI->getPointerAddressSpace() == 0 &&
+ SI->getPointerOperand() == I;
+
+ // A call to null is undefined.
+ if (auto CS = CallSite(Use))
+ return CS.getCalledValue() == I;
+ }
+ return false;
+}
+
+/// If BB has an incoming value that will always trigger undefined behavior
+/// (e.g. null pointer dereference), remove the branch leading here.
+static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
+ for (BasicBlock::iterator i = BB->begin();
+ PHINode *PHI = dyn_cast<PHINode>(i); ++i)
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) {
+ TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator();
+ IRBuilder<> Builder(T);
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ BB->removePredecessor(PHI->getIncomingBlock(i));
+ // Turn unconditional branches into unreachables and remove the dead
+ // destination from conditional branches.
+ if (BI->isUnconditional())
+ Builder.CreateUnreachable();
+ else
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+ : BI->getSuccessor(0));
+ BI->eraseFromParent();
+ return true;
+ }
+ // TODO: SwitchInst.
+ }
+
+ return false;
+}
+
+bool SimplifyCFGOpt::run(BasicBlock *BB) {
+ bool Changed = false;
+
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ // Remove basic blocks that have no predecessors (except the entry block)...
+ // or that just have themselves as a predecessor. These are unreachable.
+ if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
+ BB->getSinglePredecessor() == BB) {
+ DEBUG(dbgs() << "Removing BB: \n" << *BB);
+ DeleteDeadBlock(BB);
+ return true;
+ }
+
+ // Check to see if we can constant propagate this terminator instruction
+ // away...
+ Changed |= ConstantFoldTerminator(BB, true);
+
+ // Check for and eliminate duplicate PHI nodes in this block.
+ Changed |= EliminateDuplicatePHINodes(BB);
+
+ // Check for and remove branches that will always cause undefined behavior.
+ Changed |= removeUndefIntroducingPredecessor(BB);
+
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
+ //
+ if (MergeBlockIntoPredecessor(BB))
+ return true;
+
+ IRBuilder<> Builder(BB);
+
+ // If there is a trivial two-entry PHI node in this basic block, and we can
+ // eliminate it, do so now.
+ if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
+ if (PN->getNumIncomingValues() == 2)
+ Changed |= FoldTwoEntryPHINode(PN, TTI, DL);
+
+ Builder.SetInsertPoint(BB->getTerminator());
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ if (BI->isUnconditional()) {
+ if (SimplifyUncondBranch(BI, Builder))
+ return true;
+ } else {
+ if (SimplifyCondBranch(BI, Builder))
+ return true;
+ }
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ if (SimplifyReturn(RI, Builder))
+ return true;
+ } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
+ if (SimplifyResume(RI, Builder))
+ return true;
+ } else if (CleanupReturnInst *RI =
+ dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
+ if (SimplifyCleanupReturn(RI))
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ if (SimplifySwitch(SI, Builder))
+ return true;
+ } else if (UnreachableInst *UI =
+ dyn_cast<UnreachableInst>(BB->getTerminator())) {
+ if (SimplifyUnreachable(UI))
+ return true;
+ } else if (IndirectBrInst *IBI =
+ dyn_cast<IndirectBrInst>(BB->getTerminator())) {
+ if (SimplifyIndirectBr(IBI))
+ return true;
+ }
+
+ return Changed;
+}
+
+/// This function is used to do simplification of a CFG.
+/// For example, it adjusts branches to branches to eliminate the extra hop,
+/// eliminates unreachable basic blocks, and does other "peephole" optimization
+/// of the CFG. It returns true if a modification was made.
+///
+bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+ unsigned BonusInstThreshold, AssumptionCache *AC,
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG) {
+ return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
+ BonusInstThreshold, AC, LoopHeaders, LateSimplifyCFG)
+ .run(BB);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
new file mode 100644
index 000000000000..6d90e6b48358
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -0,0 +1,765 @@
+//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements induction variable simplification. It does
+// not define any actual pass or policy, but provides a single function to
+// simplify a loop's induction variables based on ScalarEvolution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
+STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(
+ NumSimplifiedSDiv,
+ "Number of IV signed division operations converted to unsigned division");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+namespace {
+ /// This is a utility for simplifying induction variables
+ /// based on ScalarEvolution. It is the primary instrument of the
+ /// IndvarSimplify pass, but it may also be directly invoked to clean up after
+ /// other loop passes that preserve SCEV.
+ class SimplifyIndvar {
+ Loop *L;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
+
+ bool Changed;
+
+ public:
+ SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead)
+ : L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) {
+ assert(LI && "IV simplification requires LoopInfo");
+ }
+
+ bool hasChanged() const { return Changed; }
+
+ /// Iteratively perform simplification on a worklist of users of the
+ /// specified induction variable. This is the top-level driver that applies
+ /// all simplifications to users of an IV.
+ void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
+
+ Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
+
+ bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
+
+ bool eliminateOverflowIntrinsic(CallInst *CI);
+ bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
+ void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+ void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned);
+ bool eliminateSDiv(BinaryOperator *SDiv);
+ bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
+ bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
+ };
+}
+
+/// Fold an IV operand into its use. This removes increments of an
+/// aligned IV when used by an instruction that ignores the low bits.
+///
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+///
+/// Return the operand of IVOperand for this induction variable if IVOperand can
+/// be folded (in case more folding opportunities have been exposed).
+/// Otherwise return null.
+Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
+ Value *IVSrc = nullptr;
+ unsigned OperIdx = 0;
+ const SCEV *FoldedExpr = nullptr;
+ switch (UseInst->getOpcode()) {
+ default:
+ return nullptr;
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ // We're only interested in the case where we know something about
+ // the numerator and have a constant denominator.
+ if (IVOperand != UseInst->getOperand(OperIdx) ||
+ !isa<ConstantInt>(UseInst->getOperand(1)))
+ return nullptr;
+
+ // Attempt to fold a binary operator with constant operand.
+ // e.g. ((I + 1) >> 2) => I >> 2
+ if (!isa<BinaryOperator>(IVOperand)
+ || !isa<ConstantInt>(IVOperand->getOperand(1)))
+ return nullptr;
+
+ IVSrc = IVOperand->getOperand(0);
+ // IVSrc must be the (SCEVable) IV, since the other operand is const.
+ assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
+
+ ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
+ if (UseInst->getOpcode() == Instruction::LShr) {
+ // Get a constant for the divisor. See createSCEV.
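+ // e.g. for (X >> 3), the divisor constant becomes 1 << 3 = 8, so the folded
+ // expression is X /u 8.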
+ uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
+ if (D->getValue().uge(BitWidth))
+ return nullptr;
+
+ D = ConstantInt::get(UseInst->getContext(),
+ APInt::getOneBitSet(BitWidth, D->getZExtValue()));
+ }
+ FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+ }
+ // We have something that might fold its operand. Compare SCEVs.
+ if (!SE->isSCEVable(UseInst->getType()))
+ return nullptr;
+
+ // Bypass the operand if SCEV can prove it has no effect.
+ if (SE->getSCEV(UseInst) != FoldedExpr)
+ return nullptr;
+
+ DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
+
+ UseInst->setOperand(OperIdx, IVSrc);
+ assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
+
+ ++NumElimOperand;
+ Changed = true;
+ if (IVOperand->use_empty())
+ DeadInsts.emplace_back(IVOperand);
+ return IVSrc;
+}
+
+/// SimplifyIVUsers helper for eliminating useless
+/// comparisons against an induction variable.
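+///
+/// For instance (illustrative only): with %iv = {0,+,1}<%loop> and a maximum
+/// backedge-taken count of 9, SCEV proves "icmp ult i32 %iv, 100" is always
+/// true, so the compare is replaced by "true" and queued for deletion.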
+void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ ICmpInst::Predicate OriginalPred = Pred;
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands.
+ const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
+ const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ S = SE->getSCEVAtScope(S, ICmpLoop);
+ X = SE->getSCEVAtScope(X, ICmpLoop);
+
+ ICmpInst::Predicate InvariantPredicate;
+ const SCEV *InvariantLHS, *InvariantRHS;
+
+ // If the condition is always true or always false, replace it with
+ // a constant value.
+ if (SE->isKnownPredicate(Pred, S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (isa<PHINode>(IVOperand) &&
+ SE->isLoopInvariantPredicate(Pred, S, X, L, InvariantPredicate,
+ InvariantLHS, InvariantRHS)) {
+
+ // Rewrite the comparison to a loop invariant comparison if it can be done
+ // cheaply, where cheaply means "we don't need to emit any new
+ // instructions".
+
+ Value *NewLHS = nullptr, *NewRHS = nullptr;
+
+ if (S == InvariantLHS || X == InvariantLHS)
+ NewLHS =
+ ICmp->getOperand(S == InvariantLHS ? IVOperIdx : (1 - IVOperIdx));
+
+ if (S == InvariantRHS || X == InvariantRHS)
+ NewRHS =
+ ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx));
+
+ auto *PN = cast<PHINode>(IVOperand);
+ for (unsigned i = 0, e = PN->getNumIncomingValues();
+ i != e && (!NewLHS || !NewRHS);
+ ++i) {
+
+ // If this is a value incoming from the backedge, then it cannot be a loop
+ // invariant value (since we know that IVOperand is an induction variable).
+ if (L->contains(PN->getIncomingBlock(i)))
+ continue;
+
+ // NB! The following assert does not fundamentally have to be true, but
+ // it is true today given how SCEV analyzes induction variables.
+ // Specifically, today SCEV will *not* recognize %iv as an induction
+ // variable in the following case:
+ //
+ // define void @f(i32 %k) {
+ // entry:
+ // br i1 undef, label %r, label %l
+ //
+ // l:
+ // %k.inc.l = add i32 %k, 1
+ // br label %loop
+ //
+ // r:
+ // %k.inc.r = add i32 %k, 1
+ // br label %loop
+ //
+ // loop:
+ // %iv = phi i32 [ %k.inc.l, %l ], [ %k.inc.r, %r ], [ %iv.inc, %loop ]
+ // %iv.inc = add i32 %iv, 1
+ // br label %loop
+ // }
+ //
+ // but if it starts to, at some point, then the assertion below will have
+ // to be changed to a runtime check.
+
+ Value *Incoming = PN->getIncomingValue(i);
+
+#ifndef NDEBUG
+ if (auto *I = dyn_cast<Instruction>(Incoming))
+ assert(DT->dominates(I, ICmp) && "Should be a unique loop dominating value!");
+#endif
+
+ const SCEV *IncomingS = SE->getSCEV(Incoming);
+
+ if (!NewLHS && IncomingS == InvariantLHS)
+ NewLHS = Incoming;
+ if (!NewRHS && IncomingS == InvariantRHS)
+ NewRHS = Incoming;
+ }
+
+ if (!NewLHS || !NewRHS)
+ // We could not find an existing value to replace either LHS or RHS.
+ // Generating new instructions has subtler tradeoffs, so avoid doing that
+ // for now.
+ return;
+
+ DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
+ ICmp->setPredicate(InvariantPredicate);
+ ICmp->setOperand(0, NewLHS);
+ ICmp->setOperand(1, NewRHS);
+ } else if (ICmpInst::isSigned(OriginalPred) &&
+ SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
+ // If we were unable to do anything above, all we can do is canonicalize
+ // the comparison, hoping that it will open the door for other
+ // optimizations. If we find out that we compare two non-negative values,
+ // we turn the instruction's predicate to its unsigned version. Note that
+ // we cannot rely on Pred here unless we check if we have swapped it.
+ assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
+ DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n');
+ ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
+ } else
+ return;
+
+ ++NumElimCmp;
+ Changed = true;
+}
+
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+ // Get the SCEVs for the SDiv operands.
+ auto *N = SE->getSCEV(SDiv->getOperand(0));
+ auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *L = LI->getLoopFor(SDiv->getParent());
+ N = SE->getSCEVAtScope(N, L);
+ D = SE->getSCEVAtScope(D, L);
+
+ // Replace sdiv by udiv if both of the operands are non-negative
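+ // Illustrative example (not from the original): if %iv = {0,+,1} and %n is
+ // a value SCEV knows to be non-negative, "sdiv i32 %iv, %n" becomes
+ // "udiv i32 %iv, %n", which later passes can lower more cheaply (e.g. to a
+ // shift when %n is a power of two).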
+ if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+ auto *UDiv = BinaryOperator::Create(
+ BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+ SDiv->getName() + ".udiv", SDiv);
+ UDiv->setIsExact(SDiv->isExact());
+ SDiv->replaceAllUsesWith(UDiv);
+ DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ ++NumSimplifiedSDiv;
+ Changed = true;
+ DeadInsts.push_back(SDiv);
+ return true;
+ }
+
+ return false;
+}
+
+/// SimplifyIVUsers helper for eliminating useless
+/// remainder operations operating on an induction variable.
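+///
+/// A small illustration (not in the original): if %iv = {0,+,1} is known to
+/// stay strictly below %n, then "urem i32 %iv, %n" is simply %iv; the signed
+/// variant additionally requires SCEV to prove %iv is non-negative.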
+void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
+ Value *IVOperand,
+ bool IsSigned) {
+ // We're only interested in the case where we know something about
+ // the numerator.
+ if (IVOperand != Rem->getOperand(0))
+ return;
+
+ // Get the SCEVs for the Rem operands.
+ const SCEV *S = SE->getSCEV(Rem->getOperand(0));
+ const SCEV *X = SE->getSCEV(Rem->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+ S = SE->getSCEVAtScope(S, ICmpLoop);
+ X = SE->getSCEVAtScope(X, ICmpLoop);
+
+ // i % n --> i if i is in [0,n).
+ if ((!IsSigned || SE->isKnownNonNegative(S)) &&
+ SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ S, X))
+ Rem->replaceAllUsesWith(Rem->getOperand(0));
+ else {
+ // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
+ const SCEV *LessOne = SE->getMinusSCEV(S, SE->getOne(S->getType()));
+ if (IsSigned && !SE->isKnownNonNegative(LessOne))
+ return;
+
+ if (!SE->isKnownPredicate(IsSigned ?
+ ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ LessOne, X))
+ return;
+
+ ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
+ Rem->getOperand(0), Rem->getOperand(1));
+ SelectInst *Sel =
+ SelectInst::Create(ICmp,
+ ConstantInt::get(Rem->getType(), 0),
+ Rem->getOperand(0), "tmp", Rem);
+ Rem->replaceAllUsesWith(Sel);
+ }
+
+ DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
+ Changed = true;
+ DeadInsts.emplace_back(Rem);
+}
+
+bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
+ auto *F = CI->getCalledFunction();
+ if (!F)
+ return false;
+
+ typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)(
+ const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned);
+ typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)(
+ const SCEV *, Type *, unsigned);
+
+ OperationFunctionTy Operation;
+ ExtensionFunctionTy Extension;
+
+ Instruction::BinaryOps RawOp;
+
+ // We always have exactly one of nsw or nuw. If NoSignedOverflow is false, we
+ // have nuw.
+ bool NoSignedOverflow;
+
+ switch (F->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::sadd_with_overflow:
+ Operation = &ScalarEvolution::getAddExpr;
+ Extension = &ScalarEvolution::getSignExtendExpr;
+ RawOp = Instruction::Add;
+ NoSignedOverflow = true;
+ break;
+
+ case Intrinsic::uadd_with_overflow:
+ Operation = &ScalarEvolution::getAddExpr;
+ Extension = &ScalarEvolution::getZeroExtendExpr;
+ RawOp = Instruction::Add;
+ NoSignedOverflow = false;
+ break;
+
+ case Intrinsic::ssub_with_overflow:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ Extension = &ScalarEvolution::getSignExtendExpr;
+ RawOp = Instruction::Sub;
+ NoSignedOverflow = true;
+ break;
+
+ case Intrinsic::usub_with_overflow:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ Extension = &ScalarEvolution::getZeroExtendExpr;
+ RawOp = Instruction::Sub;
+ NoSignedOverflow = false;
+ break;
+ }
+
+ const SCEV *LHS = SE->getSCEV(CI->getArgOperand(0));
+ const SCEV *RHS = SE->getSCEV(CI->getArgOperand(1));
+
+ auto *NarrowTy = cast<IntegerType>(LHS->getType());
+ auto *WideTy =
+ IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
+
+ const SCEV *A =
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
+ WideTy, 0);
+ const SCEV *B =
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
+ (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
+
+ if (A != B)
+ return false;
+
+ // Proved no overflow, nuke the overflow check and, if possible, the overflow
+ // intrinsic as well.
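+ // For example (illustrative): given "llvm.sadd.with.overflow.i32(%iv, 1)"
+ // where SCEV shows sext(%iv + 1) to i64 equals sext(%iv) + sext(1), the
+ // arithmetic result is replaced by a plain "add nsw i32 %iv, 1" and every
+ // extract of the overflow bit is replaced by "false".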
+
+ BinaryOperator *NewResult = BinaryOperator::Create(
+ RawOp, CI->getArgOperand(0), CI->getArgOperand(1), "", CI);
+
+ if (NoSignedOverflow)
+ NewResult->setHasNoSignedWrap(true);
+ else
+ NewResult->setHasNoUnsignedWrap(true);
+
+ SmallVector<ExtractValueInst *, 4> ToDelete;
+
+ for (auto *U : CI->users()) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
+ if (EVI->getIndices()[0] == 1)
+ EVI->replaceAllUsesWith(ConstantInt::getFalse(CI->getContext()));
+ else {
+ assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
+ EVI->replaceAllUsesWith(NewResult);
+ }
+ ToDelete.push_back(EVI);
+ }
+ }
+
+ for (auto *EVI : ToDelete)
+ EVI->eraseFromParent();
+
+ if (CI->use_empty())
+ CI->eraseFromParent();
+
+ return true;
+}
+
+/// Eliminate an operation that consumes a simple IV and has no observable
+/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
+/// but UseInst may not be.
+bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ eliminateIVComparison(ICmp, IVOperand);
+ return true;
+ }
+ if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+ if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+ eliminateIVRemainder(Bin, IVOperand, IsSRem);
+ return true;
+ }
+
+ if (Bin->getOpcode() == Instruction::SDiv)
+ return eliminateSDiv(Bin);
+ }
+
+ if (auto *CI = dyn_cast<CallInst>(UseInst))
+ if (eliminateOverflowIntrinsic(CI))
+ return true;
+
+ if (eliminateIdentitySCEV(UseInst, IVOperand))
+ return true;
+
+ return false;
+}
+
+/// Eliminate any operation that SCEV can prove is an identity function.
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (!SE->isSCEVable(UseInst->getType()) ||
+ (UseInst->getType() != IVOperand->getType()) ||
+ (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
+ return false;
+
+ // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the
+ // dominator tree, even if X is an operand to Y. For instance, in
+ //
+ // %iv = phi i32 {0,+,1}
+ // br %cond, label %left, label %merge
+ //
+ // left:
+ // %X = add i32 %iv, 0
+ // br label %merge
+ //
+ // merge:
+ // %M = phi (%X, %iv)
+ //
+ // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and
+ // %M.replaceAllUsesWith(%X) would be incorrect.
+
+ if (isa<PHINode>(UseInst))
+ // If UseInst is not a PHI node then we know that IVOperand dominates
+ // UseInst directly from the legality of SSA.
+ if (!DT || !DT->dominates(IVOperand, UseInst))
+ return false;
+
+ if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
+ return false;
+
+ DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+
+ UseInst->replaceAllUsesWith(IVOperand);
+ ++NumElimIdentity;
+ Changed = true;
+ DeadInsts.emplace_back(UseInst);
+ return true;
+}
+
+/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
+/// unsigned-overflow. Returns true if anything changed, false otherwise.
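+///
+/// Sketch of the idea (illustrative, not from the original comment): for
+/// "%t = add i32 %iv, 1", if zext(%iv + 1) to a type twice as wide equals
+/// zext(%iv) + 1 as SCEV expressions, the add cannot wrap unsigned and is
+/// marked nuw; the analogous sign-extended comparison justifies nsw.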
+bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
+ Value *IVOperand) {
+
+ // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
+ if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
+ return false;
+
+ const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *,
+ SCEV::NoWrapFlags, unsigned);
+ switch (BO->getOpcode()) {
+ default:
+ return false;
+
+ case Instruction::Add:
+ GetExprForBO = &ScalarEvolution::getAddExpr;
+ break;
+
+ case Instruction::Sub:
+ GetExprForBO = &ScalarEvolution::getMinusSCEV;
+ break;
+
+ case Instruction::Mul:
+ GetExprForBO = &ScalarEvolution::getMulExpr;
+ break;
+ }
+
+ unsigned BitWidth = cast<IntegerType>(BO->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(BO->getContext(), BitWidth * 2);
+ const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
+ const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
+
+ bool Changed = false;
+
+ if (!BO->hasNoUnsignedWrap()) {
+ const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy);
+ const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
+ SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy),
+ SCEV::FlagAnyWrap, 0u);
+ if (ExtendAfterOp == OpAfterExtend) {
+ BO->setHasNoUnsignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+ }
+
+ if (!BO->hasNoSignedWrap()) {
+ const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy);
+ const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
+ SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy),
+ SCEV::FlagAnyWrap, 0u);
+ if (ExtendAfterOp == OpAfterExtend) {
+ BO->setHasNoSignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// Annotate the Shr in (X << IVOperand) >> C as exact using the
+/// information from the IV's range. Returns true if anything changed, false
+/// otherwise.
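+///
+/// For instance (illustrative): given "%s = shl i32 %x, %iv" and
+/// "%r = lshr i32 %s, 3", if the unsigned range of %iv guarantees %iv >= 3,
+/// every bit the lshr discards was introduced by the shl, so the shift can
+/// be marked exact.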
+bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
+ Value *IVOperand) {
+ using namespace llvm::PatternMatch;
+
+ if (BO->getOpcode() == Instruction::Shl) {
+ bool Changed = false;
+ ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
+ for (auto *U : BO->users()) {
+ const APInt *C;
+ if (match(U,
+ m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
+ match(U,
+ m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
+ BinaryOperator *Shr = cast<BinaryOperator>(U);
+ if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
+ Shr->setIsExact(true);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+ }
+
+ return false;
+}
+
+/// Add all users of Def to the current IV's worklist.
+static void pushIVUsers(
+ Instruction *Def,
+ SmallPtrSet<Instruction*,16> &Simplified,
+ SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
+
+ for (User *U : Def->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Avoid infinite or exponential worklist processing.
+ // Also ensure unique worklist users.
+ // If Def is a LoopPhi, it may not be in the Simplified set, so check for
+ // self edges first.
+ if (UI != Def && Simplified.insert(UI).second)
+ SimpleIVUsers.push_back(std::make_pair(UI, Def));
+ }
+}
+
+/// Return true if this instruction generates a simple SCEV
+/// expression (an affine recurrence) in terms of the given loop's IV.
+///
+/// This is similar to IVUsers' isInteresting() but processes each instruction
+/// non-recursively when the operand is already known to be a simpleIVUser.
+///
+static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ // Only consider affine recurrences.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (AR && AR->getLoop() == L)
+ return true;
+
+ return false;
+}
+
+/// Iteratively perform simplification on a worklist of users
+/// of the specified induction variable. Each successive simplification may push
+/// more users which may themselves be candidates for simplification.
+///
+/// This algorithm does not require IVUsers analysis. Instead, it simplifies
+/// instructions in-place during analysis. Rather than rewriting induction
+/// variables bottom-up from their users, it transforms a chain of IVUsers
+/// top-down, updating the IR only when it encounters a clear optimization
+/// opportunity.
+///
+/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
+///
+void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
+ if (!SE->isSCEVable(CurrIV->getType()))
+ return;
+
+ // Instructions processed by SimplifyIndvar for CurrIV.
+ SmallPtrSet<Instruction*,16> Simplified;
+
+ // Use-def pairs of IV users waiting to be processed for CurrIV.
+ SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
+
+ // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
+ // called multiple times for the same LoopPhi. This is the proper thing to
+ // do for loop header phis that use each other.
+ pushIVUsers(CurrIV, Simplified, SimpleIVUsers);
+
+ while (!SimpleIVUsers.empty()) {
+ std::pair<Instruction*, Instruction*> UseOper =
+ SimpleIVUsers.pop_back_val();
+ Instruction *UseInst = UseOper.first;
+
+ // Bypass back edges to avoid extra work.
+ if (UseInst == CurrIV) continue;
+
+ Instruction *IVOperand = UseOper.second;
+ for (unsigned N = 0; IVOperand; ++N) {
+ assert(N <= Simplified.size() && "runaway iteration");
+
+ Value *NewOper = foldIVUser(UseOper.first, IVOperand);
+ if (!NewOper)
+ break; // done folding
+ IVOperand = dyn_cast<Instruction>(NewOper);
+ }
+ if (!IVOperand)
+ continue;
+
+ if (eliminateIVUser(UseOper.first, IVOperand)) {
+ pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
+ continue;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
+ if ((isa<OverflowingBinaryOperator>(BO) &&
+ strengthenOverflowingOperation(BO, IVOperand)) ||
+ (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
+ // re-queue uses of the now modified binary operator and fall
+ // through to the checks that remain.
+ pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
+ }
+ }
+
+ CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
+ if (V && Cast) {
+ V->visitCast(Cast);
+ continue;
+ }
+ if (isSimpleIVUser(UseOper.first, L, SE)) {
+ pushIVUsers(UseOper.first, Simplified, SimpleIVUsers);
+ }
+ }
+}
+
+namespace llvm {
+
+void IVVisitor::anchor() { }
+
+/// Simplify instructions that use this induction variable
+/// by using ScalarEvolution to analyze the IV's recurrence.
+bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead,
+ IVVisitor *V) {
+ SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead);
+ SIV.simplifyUsers(CurrIV, V);
+ return SIV.hasChanged();
+}
+
+/// Simplify users of induction variables within this
+/// loop. This does not actually change or add IVs.
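+///
+/// Typical use from a loop pass (an illustrative sketch, not taken from this
+/// file):
+///   SmallVector<WeakTrackingVH, 16> DeadInsts;
+///   if (simplifyLoopIVs(L, SE, DT, LI, DeadInsts))
+///     while (!DeadInsts.empty())
+///       if (auto *Inst =
+///               dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+///         RecursivelyDeleteTriviallyDeadInstructions(Inst);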
+bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead) {
+ bool Changed = false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead);
+ }
+ return Changed;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
new file mode 100644
index 000000000000..2ea15f65cef9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -0,0 +1,152 @@
+//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility pass used for testing the InstructionSimplify analysis.
+// The analysis is applied to every instruction, and if it simplifies then the
+// instruction is replaced by the simplification. If you are looking for a pass
+// that performs serious instruction folding, use the instcombine pass instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyInstructions.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ bool Changed = false;
+
+ do {
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ // Here be subtlety: the iterator must be incremented before the loop
+ // body, because the body may erase the instruction it points at, so a
+ // range-for loop won't work here.
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+ // The first time through the loop ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
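+ // For example (illustrative): if "%a = add i32 %x, 0" simplifies to %x,
+ // every user of %a is added to Next below, so those users get another
+ // simplification attempt on the following outer iteration.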
+ if (!ToSimplify->empty() && !ToSimplify->count(I))
+ continue;
+
+ // Don't waste time simplifying unused instructions.
+ if (!I->use_empty()) {
+ if (Value *V = SimplifyInstruction(I, SQ, ORE)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I->users())
+ Next->insert(cast<Instruction>(U));
+ I->replaceAllUsesWith(V);
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) {
+ // RecursivelyDeleteTriviallyDeadInstructions can remove more than one
+ // instruction, so simply incrementing the iterator does not work.
+ // When instructions get deleted, re-iterate instead.
+ BI = BB->begin();
+ BE = BB->end();
+ Changed = true;
+ }
+ }
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+ } while (!ToSimplify->empty());
+
+ return Changed;
+}
+
+namespace {
+ struct InstSimplifier : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifier() : FunctionPass(ID) {
+ initializeInstSimplifierPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// runOnFunction - Remove instructions that simplify.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+ };
+}
+
+char InstSimplifier::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
+ "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
+ "Remove redundant instructions", false, false)
+char &llvm::InstructionSimplifierID = InstSimplifier::ID;
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstructionSimplifierPass() {
+ return new InstSimplifier();
+}
+
+PreservedAnalyses InstSimplifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+ bool Changed = runImpl(F, SQ, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
new file mode 100644
index 000000000000..77c0a41929ac
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -0,0 +1,2440 @@
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the library call simplifier. It does not implement
+// any pass, but can be used by other passes to perform these simplifications.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static cl::opt<bool>
+ EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
+
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+static bool ignoreCallingConv(LibFunc Func) {
+ return Func == LibFunc_abs || Func == LibFunc_labs ||
+ Func == LibFunc_llabs || Func == LibFunc_strlen;
+}
+
+static bool isCallingConvCCompatible(CallInst *CI) {
+ switch(CI->getCallingConv()) {
+ default:
+ return false;
+ case llvm::CallingConv::C:
+ return true;
+ case llvm::CallingConv::ARM_APCS:
+ case llvm::CallingConv::ARM_AAPCS:
+ case llvm::CallingConv::ARM_AAPCS_VFP: {
+
+ // The iOS ABI diverges from the standard in some cases, so for now don't
+ // try to simplify those calls.
+ if (Triple(CI->getModule()->getTargetTriple()).isiOS())
+ return false;
+
+ auto *FuncTy = CI->getFunctionType();
+
+ if (!FuncTy->getReturnType()->isPointerTy() &&
+ !FuncTy->getReturnType()->isIntegerTy() &&
+ !FuncTy->getReturnType()->isVoidTy())
+ return false;
+
+ for (auto Param : FuncTy->params()) {
+ if (!Param->isPointerTy() && !Param->isIntegerTy())
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return true if it is only used in equality comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (User *U : V->users()) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+ return any_of(CI->operands(), [](const Use &OI) {
+ return OI->getType()->isFloatingPointTy();
+ });
+}
+
+/// \brief Check whether the overloaded unary floating point function
+/// corresponding to \a Ty is available.
+static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
+ // Extract some information from the instruction
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0)
+ return nullptr;
+ --Len; // Unbias length.
+
+ // Handle the simple, do-nothing case: strcat(x, "") -> x
+ if (Len == 0)
+ return Dst;
+
+ return emitStrLenMemCpy(Src, Dst, Len, B);
+}
+
+Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+ IRBuilder<> &B) {
+ // We need to find the end of the destination string. That's where the
+ // memory is to be moved to. We just generate a call to strlen.
+ Value *DstLen = emitStrLen(Dst, B, DL, TLI);
+ if (!DstLen)
+ return nullptr;
+
+ // Now that we have the destination's length, we must index into the
+ // destination's pointer to get the actual memcpy destination (end of
+ // the string we're concatenating onto).
+ Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
+ B.CreateMemCpy(CpyDst, Src,
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1),
+ 1);
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
+ // Extract some information from the instruction.
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ uint64_t Len;
+
+ // We don't do anything if length is not constant.
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+ Len = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0)
+ return nullptr;
+ --SrcLen; // Unbias length.
+
+ // Handle the simple, do-nothing cases:
+ // strncat(x, "", c) -> x
+ // strncat(x, c, 0) -> x
+ if (SrcLen == 0 || Len == 0)
+ return Dst;
+
+ // We don't optimize this case.
+ if (Len < SrcLen)
+ return nullptr;
+
+ // strncat(x, s, c) -> strcat(x, s)
+ // s is constant so the strcat can be optimized further.
+ return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+}
+
+Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ Value *SrcStr = CI->getArgOperand(0);
+
+ // If the second operand is non-constant, see if we can compute the length
+ // of the input string and turn this into memchr.
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!CharC) {
+ uint64_t Len = GetStringLength(SrcStr);
+ if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
+ return nullptr;
+
+ return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
+ B, DL, TLI);
+ }
+
+ // Otherwise, the character is a constant, see if the first argument is
+ // a string literal. If so, we can constant fold.
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI),
+ "strchr");
+ return nullptr;
+ }
+
+ // Compute the offset, make sure to handle the case when we're searching for
+ // zero (a weird way to spell strlen).
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.find(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. strchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // strchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
+}
+
+Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+
+ // Cannot fold anything if we're not looking for a constant.
+ if (!CharC)
+ return nullptr;
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ // strrchr(s, 0) -> strchr(s, 0)
+ if (CharC->isZero())
+ return emitStrChr(SrcStr, '\0', B, TLI);
+ return nullptr;
+ }
+
+ // Compute the offset.
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.rfind(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. Return null.
+ return Constant::getNullValue(CI->getType());
+
+ // strrchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
+}
+
+Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strcmp(x,x) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strcmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+ return B.CreateNeg(
+ B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+ // strcmp(P, "x") -> memcmp(P, "x", 2)
+ uint64_t Len1 = GetStringLength(Str1P);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len1 && Len2) {
+ return emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ std::min(Len1, Len2)),
+ B, DL, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strncmp(x,x,n) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ // Get the length argument if it is constant.
+ uint64_t Length;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+ Length = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ if (Length == 0) // strncmp(x,y,0) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+ return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strncmp(x, y, n) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2) {
+ StringRef SubStr1 = Str1.substr(0, Length);
+ StringRef SubStr2 = Str2.substr(0, Length);
+ return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+ }
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+ return B.CreateNeg(
+ B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) // strcpy(x,x) -> x
+ return Src;
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0)
+ return nullptr;
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ B.CreateMemCpy(Dst, Src,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1);
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0)
+ return nullptr;
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
+ Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
+ ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ B.CreateMemCpy(Dst, Src, LenV, 1);
+ return DstEnd;
+}
+
+Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *LenOp = CI->getArgOperand(2);
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0)
+ return nullptr;
+ --SrcLen;
+
+ if (SrcLen == 0) {
+ // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+ B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
+ return Dst;
+ }
+
+ uint64_t Len;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+ Len = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ if (Len == 0)
+ return Dst; // strncpy(x, y, 0) -> x
+
+ // Let strncpy handle the zero padding
+ if (Len > SrcLen + 1)
+ return nullptr;
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+ B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1);
+
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B,
+ unsigned CharSize) {
+ Value *Src = CI->getArgOperand(0);
+
+ // Constant folding: strlen("xyz") -> 3
+ if (uint64_t Len = GetStringLength(Src, CharSize))
+ return ConstantInt::get(CI->getType(), Len - 1);
+
+ // If s is a constant pointer pointing to a string literal, we can fold
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
+ // We only try to simplify strlen when the pointer s points to an array
+ // of i8. Otherwise, we would need to scale the offset x before doing the
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
+ // very uncommon.
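+ // An illustrative instance (not from the original): with
+ //   @s = constant [5 x i8] c"abcd\00"
+ //   %p = getelementptr inbounds [5 x i8], [5 x i8]* @s, i64 0, i64 %x
+ // strlen(%p) becomes "4 - %x", since either %x is provably in [0, 4] or
+ // any out-of-range %x would already be undefined behavior for the
+ // inbounds GEP on the global.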
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
+ if (!isGEPBasedOnPointerToString(GEP, CharSize))
+ return nullptr;
+
+ ConstantDataArraySlice Slice;
+ if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
+ uint64_t NullTermIdx;
+ if (Slice.Array == nullptr) {
+ NullTermIdx = 0;
+ } else {
+ NullTermIdx = ~((uint64_t)0);
+ for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
+ if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
+ NullTermIdx = I;
+ break;
+ }
+ }
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == ~((uint64_t)0))
+ return nullptr;
+ }
+
+ Value *Offset = GEP->getOperand(2);
+ KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
+ Known.Zero.flipAllBits();
+ uint64_t ArrSize =
+ cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
+
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+ // bits known to be zeros in Offset, and ones in KnownZero represent
+ // bits unknown in Offset. Therefore, Offset is known to be in range
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // unsigned-less-than NullTermIdx.
+ //
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // is a pointer to an object whose memory extent is NullTermIdx+1.
+ if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
+ (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
+ NullTermIdx == ArrSize - 1)) {
+ Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ Offset);
+ }
+ }
+
+ return nullptr;
+ }
+
+ // strlen(x?"foo":"bars") --> x ? 3 : 4
+ if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
+ uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
+ uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
+ if (LenTrue && LenFalse) {
+ Function *Caller = CI->getParent()->getParent();
+ emitOptimizationRemark(CI->getContext(), "simplify-libcalls", *Caller,
+ SI->getDebugLoc(),
+ "folded strlen(select) to select of constants");
+ return B.CreateSelect(SI->getCondition(),
+ ConstantInt::get(CI->getType(), LenTrue - 1),
+ ConstantInt::get(CI->getType(), LenFalse - 1));
+ }
+ }
+
+ // strlen(x) != 0 --> *x != 0
+ // strlen(x) == 0 --> *x == 0
+ if (isOnlyUsedInZeroEqualityComparison(CI))
+ return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+ return optimizeStringLength(CI, B, 8);
+}
+
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
+ Module &M = *CI->getParent()->getParent()->getParent();
+ unsigned WCharSize = TLI->getWCharSize(M) * 8;
+
+ return optimizeStringLength(CI, B, WCharSize);
+}
+
+Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strpbrk(s, "") -> nullptr
+ // strpbrk("", s) -> nullptr
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t I = S1.find_first_of(S2);
+ if (I == StringRef::npos) // No match.
+ return Constant::getNullValue(CI->getType());
+
+ return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
+ "strpbrk");
+ }
+
+ // strpbrk(s, "a") -> strchr(s, 'a')
+ if (HasS2 && S2.size() == 1)
+ return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
+ Value *EndPtr = CI->getArgOperand(1);
+ if (isa<ConstantPointerNull>(EndPtr)) {
+ // With a null EndPtr, this function won't capture the main argument.
+ // It would be readonly too, except that it still may write to errno.
+ CI->addParamAttr(0, Attribute::NoCapture);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strspn(s, "") -> 0
+ // strspn("", s) -> 0
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_not_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (HasS2 && S2.empty())
+ return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
+ if (!StrLen)
+ return nullptr;
+ Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, DL, TLI);
+ if (!StrNCmp)
+ return nullptr;
+ for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp =
+ B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
+ replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ size_t Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = castToCStr(CI->getArgOperand(0), B);
+ Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+
+ // memchr(x, y, 0) -> null
+ if (LenC && LenC->isZero())
+ return Constant::getNullValue(CI->getType());
+
+ // From now on we need at least constant length and string.
+ StringRef Str;
+ if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
+ return nullptr;
+
+ // Truncate the string to LenC. If Str is smaller than LenC we will still only
+ // scan the string, as reading past the end of it is undefined and we can just
+ // return null if we don't find the char.
+ Str = Str.substr(0, LenC->getZExtValue());
+
+ // If the char is variable but the input str and length are not we can turn
+ // this memchr call into a simple bit field test. Of course this only works
+ // when the return value is only checked against null.
+ //
+ // It would be really nice to reuse switch lowering here but we can't change
+ // the CFG at this point.
+ //
+ // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0
+ // after bounds check.
+ if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
+ unsigned char Max =
+ *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
+ reinterpret_cast<const unsigned char *>(Str.end()));
+
+ // Make sure the bit field we're about to create fits in a register on the
+ // target.
+ // FIXME: On a 64 bit architecture this prevents us from using the
+ // interesting range of alpha ascii chars. We could do better by emitting
+ // two bitfields or shifting the range by 64 if no lower chars are used.
+ if (!DL.fitsInLegalInteger(Max + 1))
+ return nullptr;
+
+ // For the bit field use a power-of-2 type with at least 8 bits to avoid
+ // creating unnecessary illegal types.
+ unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
+
+ // Now build the bit field.
+ APInt Bitfield(Width, 0);
+ for (char C : Str)
+ Bitfield.setBit((unsigned char)C);
+ Value *BitfieldC = B.getInt(Bitfield);
+
+ // First check that the bit field access is within bounds.
+ Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
+ Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
+ "memchr.bounds");
+
+ // Create code that checks if the given bit is set in the field.
+ Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
+ Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
+
+ // Finally merge both checks and cast to pointer type. The inttoptr
+ // implicitly zexts the i1 to intptr type.
+ return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
+ }
+
+ // Check if all arguments are constants. If so, we can constant fold.
+ if (!CharC)
+ return nullptr;
+
+ // Compute the offset.
+ size_t I = Str.find(CharC->getSExtValue() & 0xFF);
+ if (I == StringRef::npos) // Didn't find the char. memchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // memchr(s+n,c,l) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr");
+}
+
+Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // Make sure we have a constant length.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!LenC)
+ return nullptr;
+
+ uint64_t Len = LenC->getZExtValue();
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV = B.CreateZExt(B.CreateLoad(castToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV = B.CreateZExt(B.CreateLoad(castToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
+ if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
+
+ IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
+ unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
+
+ if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment &&
+ getKnownAlignment(RHS, DL, CI) >= PrefAlignment) {
+
+ Type *LHSPtrTy =
+ IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
+ Type *RHSPtrTy =
+ IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
+
+ Value *LHSV =
+ B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv");
+ Value *RHSV =
+ B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv");
+
+ return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
+ }
+ }
+
+ // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return nullptr;
+ // Fold the memcmp and normalize the result. This way we get consistent
+ // results across multiple platforms.
+ uint64_t Ret = 0;
+ int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ if (Cmp < 0)
+ Ret = -1;
+ else if (Cmp > 0)
+ Ret = 1;
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
+ // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+}
+
+Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
+ // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+}
+
+// TODO: Does this belong in BuildLibCalls or should all of those similar
+// functions be moved here?
+static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilder<> &B, const TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+ PtrType, PtrType);
+ CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
+
+ if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
+static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
+ const TargetLibraryInfo &TLI) {
+ // This has to be a memset of zeros (bzero).
+ auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
+ if (!FillValue || FillValue->getZExtValue() != 0)
+ return nullptr;
+
+ // TODO: We should handle the case where the malloc has more than one use.
+ // This is necessary to optimize common patterns such as when the result of
+ // the malloc is checked against null or when a memset intrinsic is used in
+ // place of a memset library call.
+ auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
+ if (!Malloc || !Malloc->hasOneUse())
+ return nullptr;
+
+ // Is the inner call really malloc()?
+ Function *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return nullptr;
+
+ LibFunc Func;
+ if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+ Func != LibFunc_malloc)
+ return nullptr;
+
+ // The memset must cover the same number of bytes that are malloc'd.
+ if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
+ return nullptr;
+
+ // Replace the malloc with a calloc. We need the data layout to know what the
+ // actual size of a 'size_t' parameter is.
+ B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
+ const DataLayout &DL = Malloc->getModule()->getDataLayout();
+ IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
+ Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
+ Malloc->getArgOperand(0), Malloc->getAttributes(),
+ B, TLI);
+ if (!Calloc)
+ return nullptr;
+
+ Malloc->replaceAllUsesWith(Calloc);
+ Malloc->eraseFromParent();
+
+ return Calloc;
+}
+
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
+ if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
+ return Calloc;
+
+ // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+}
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+/// Return a variant of Val with float type.
+/// Currently this works in two cases: If Val is an fpext of a float value
+/// to something bigger, simply return the operand.
+/// If Val is a ConstantFP that can be converted to a float ConstantFP
+/// without loss of precision, do so.
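+/// For example, the f64 constant 1.0 converts to float exactly and is
+/// shrunk, while 0.1 would lose precision and is left alone.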
+static Value *valueHasFloatPrecision(Value *Val) {
+ if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
+ Value *Op = Cast->getOperand(0);
+ if (Op->getType()->isFloatTy())
+ return Op;
+ }
+ if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
+ APFloat F = Const->getValueAPF();
+ bool losesInfo;
+ (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ if (!losesInfo)
+ return ConstantFP::get(Const->getContext(), F);
+ }
+ return nullptr;
+}
+
+/// Shrink double -> float for unary functions like 'floor'.
+static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
+ bool CheckRetType) {
+ Function *Callee = CI->getCalledFunction();
+ // We know this libcall has a valid prototype, but we don't know which.
+ if (!CI->getType()->isDoubleTy())
+ return nullptr;
+
+ if (CheckRetType) {
+ // Check if all the uses for function like 'sin' are converted to float.
+ for (User *U : CI->users()) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
+ if (!Cast || !Cast->getType()->isFloatTy())
+ return nullptr;
+ }
+ }
+
+ // If this is something like 'floor((double)floatval)', convert to floorf.
+ Value *V = valueHasFloatPrecision(CI->getArgOperand(0));
+ if (V == nullptr)
+ return nullptr;
+
+ // If call isn't an intrinsic, check that it isn't within a function with the
+ // same name as the float version of this call.
+ //
+ // e.g. inline float expf(float val) { return (float) exp((double) val); }
+ //
+ // A similar such definition exists in the MinGW-w64 math.h header file which
+ // when compiled with -O2 -ffast-math causes the generation of infinite loops
+ // where expf is called.
+ if (!Callee->isIntrinsic()) {
+ const Function *F = CI->getFunction();
+ StringRef FName = F->getName();
+ StringRef CalleeName = Callee->getName();
+ if ((FName.size() == (CalleeName.size() + 1)) &&
+ (FName.back() == 'f') &&
+ FName.startswith(CalleeName))
+ return nullptr;
+ }
+
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // floor((double)floatval) -> (double)floorf(floatval)
+ if (Callee->isIntrinsic()) {
+ Module *M = CI->getModule();
+ Intrinsic::ID IID = Callee->getIntrinsicID();
+ Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
+ V = B.CreateCall(F, V);
+ } else {
+ // The call is a library call rather than an intrinsic.
+ V = emitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ }
+
+ return B.CreateFPExt(V, B.getDoubleTy());
+}
+
+// Replace a libcall \p CI with a call to intrinsic \p IID
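+// This is used, for example, to map fabs/ceil/floor/round/nearbyint/rint/
+// trunc libcalls onto their LLVM intrinsic counterparts while preserving
+// the call's name and fast-math flags.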
+static Value *replaceUnaryCall(CallInst *CI, IRBuilder<> &B, Intrinsic::ID IID) {
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Module *M = CI->getModule();
+ Value *V = CI->getArgOperand(0);
+ Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
+ CallInst *NewCall = B.CreateCall(F, V);
+ NewCall->takeName(CI);
+ return NewCall;
+}
+
+/// Shrink double -> float for binary functions like 'fmin/fmax'.
+static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ // We know this libcall has a valid prototype, but we don't know which.
+ if (!CI->getType()->isDoubleTy())
+ return nullptr;
+
+ // If this is something like 'fmin((double)floatval1, (double)floatval2)',
+ // or fmin(1.0, (double)floatval), then we convert it to fminf.
+ Value *V1 = valueHasFloatPrecision(CI->getArgOperand(0));
+ if (V1 == nullptr)
+ return nullptr;
+ Value *V2 = valueHasFloatPrecision(CI->getArgOperand(1));
+ if (V2 == nullptr)
+ return nullptr;
+
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // fmin((double)floatval1, (double)floatval2)
+ // -> (double)fminf(floatval1, floatval2)
+ // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP().
+ Value *V = emitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
+ Callee->getAttributes());
+ return B.CreateFPExt(V, B.getDoubleTy());
+}
+
+Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ // cos(-x) -> cos(x)
+ Value *Op1 = CI->getArgOperand(0);
+ if (BinaryOperator::isFNeg(Op1)) {
+ BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
+ return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
+ }
+ return Ret;
+}
+
+static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) {
+ // Multiplications calculated using Addition Chains.
+ // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
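+ // For example, AddChain[13] = {4, 9}: x^13 is built as x^4 * x^9, where
+ // x^9 = x^1 * x^8 and x^8 = x^4 * x^4. Intermediate powers are memoized
+ // in InnerChain, so each power is computed at most once.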
+
+ assert(Exp != 0 && "Incorrect exponent 0 not handled");
+
+ if (InnerChain[Exp])
+ return InnerChain[Exp];
+
+ static const unsigned AddChain[33][2] = {
+ {0, 0}, // Unused.
+ {0, 0}, // Unused (base case = pow1).
+ {1, 1}, // Unused (pre-computed).
+ {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
+ {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
+ {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
+ {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
+ {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
+ };
+
+ InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
+ getPow(InnerChain, AddChain[Exp][1], B));
+ return InnerChain[Exp];
+}
+
+Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+
+ // pow(1.0, x) -> 1.0
+ if (match(Op1, m_SpecificFP(1.0)))
+ return Op1;
+ // pow(2.0, x) -> llvm.exp2(x)
+ if (match(Op1, m_SpecificFP(2.0))) {
+ Value *Exp2 = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::exp2,
+ CI->getType());
+ return B.CreateCall(Exp2, Op2, "exp2");
+ }
+
+ // There's no llvm.exp10 intrinsic yet, but maybe some day there will
+ // be one.
+ if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+ // pow(10.0, x) -> exp10(x)
+ if (Op1C->isExactlyValue(10.0) &&
+ hasUnaryFloatFn(TLI, Op1->getType(), LibFunc_exp10, LibFunc_exp10f,
+ LibFunc_exp10l))
+ return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc_exp10), B,
+ Callee->getAttributes());
+ }
+
+ // pow(exp(x), y) -> exp(x * y)
+ // pow(exp2(x), y) -> exp2(x * y)
+ // We enable these only with fast-math. Besides rounding differences, the
+ // transformation changes overflow and underflow behavior quite dramatically.
+ // Example: x = 1000, y = 0.001.
+ // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
+ auto *OpC = dyn_cast<CallInst>(Op1);
+ if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) {
+ LibFunc Func;
+ Function *OpCCallee = OpC->getCalledFunction();
+ if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) &&
+ TLI->has(Func) && (Func == LibFunc_exp || Func == LibFunc_exp2)) {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+ Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul");
+ return emitUnaryFloatFnCall(FMul, OpCCallee->getName(), B,
+ OpCCallee->getAttributes());
+ }
+ }
+
+ ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+ if (!Op2C)
+ return Ret;
+
+ if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
+ return ConstantFP::get(CI->getType(), 1.0);
+
+ if (Op2C->isExactlyValue(-0.5) &&
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
+ // If -ffast-math:
+ // pow(x, -0.5) -> 1.0 / sqrt(x)
+ if (CI->hasUnsafeAlgebra()) {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // TODO: If the pow call is an intrinsic, we should lower to the sqrt
+ // intrinsic, so we match errno semantics. We also should check that the
+ // target can in fact lower the sqrt intrinsic -- we currently have no way
+ // to ask this question other than asking whether the target has a sqrt
+ // libcall, which is a sufficient but not necessary condition.
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
+ Callee->getAttributes());
+
+ return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip");
+ }
+ }
+
+ if (Op2C->isExactlyValue(0.5) &&
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
+
+ // In -ffast-math, pow(x, 0.5) -> sqrt(x).
+ if (CI->hasUnsafeAlgebra()) {
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
+ return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
+ Callee->getAttributes());
+ }
+
+ // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
+ // This is faster than calling pow, and still handles negative zero
+ // and negative infinity correctly.
+ // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
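+ // (Per C99, pow(-0.0, 0.5) is +0.0 and pow(-inf, 0.5) is +inf, whereas a
+ // bare sqrt would give -0.0 and NaN respectively; the fabs and the select
+ // below restore those results.)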
+ Value *Inf = ConstantFP::getInfinity(CI->getType());
+ Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
+
+ Module *M = Callee->getParent();
+ Function *FabsF = Intrinsic::getDeclaration(M, Intrinsic::fabs,
+ CI->getType());
+ Value *FAbs = B.CreateCall(FabsF, Sqrt);
+
+ Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+ Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
+ return Sel;
+ }
+
+ if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x
+ return Op1;
+ if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x
+ return B.CreateFMul(Op1, Op1, "pow2");
+ if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+ return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+
+ // In -ffast-math, generate repeated fmul instead of generating pow(x, n).
+ if (CI->hasUnsafeAlgebra()) {
+ APFloat V = abs(Op2C->getValueAPF());
+ // We limit to a max of 7 fmul(s). Thus max exponent is 32.
+ // This transformation applies to integer exponents only.
+ if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan ||
+ !V.isInteger())
+ return nullptr;
+
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // We will memoize intermediate products of the Addition Chain.
+ Value *InnerChain[33] = {nullptr};
+ InnerChain[1] = Op1;
+ InnerChain[2] = B.CreateFMul(Op1, Op1);
+
+ // We cannot readily convert a non-double type (like float) to a double.
+ // So we first convert V to something which could be converted to double.
+ bool ignored;
+ V.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored);
+
+ Value *FMul = getPow(InnerChain, V.convertToDouble(), B);
+ // For negative exponents simply compute the reciprocal.
+ if (Op2C->isNegative())
+ FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul);
+ return FMul;
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Value *Op = CI->getArgOperand(0);
+ // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
+ // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
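+ // (ldexp(1.0, n) is 1.0 * 2^n, which matches exp2 on integer inputs.)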
+ LibFunc LdExp = LibFunc_ldexpl;
+ if (Op->getType()->isFloatTy())
+ LdExp = LibFunc_ldexpf;
+ else if (Op->getType()->isDoubleTy())
+ LdExp = LibFunc_ldexp;
+
+ if (TLI->has(LdExp)) {
+ Value *LdExpArg = nullptr;
+ if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+ LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
+ } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+ LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
+ }
+
+ if (LdExpArg) {
+ Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f));
+ if (!Op->getType()->isFloatTy())
+ One = ConstantExpr::getFPExtend(One, Op->getType());
+
+ Module *M = CI->getModule();
+ Value *NewCallee =
+ M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
+ Op->getType(), B.getInt32Ty());
+ CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg});
+ if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+ }
+ }
+ return Ret;
+}
+
+Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ // If we can shrink the call to a float function rather than a double
+ // function, do that first.
+ StringRef Name = Callee->getName();
+ if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
+ if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
+ return Ret;
+
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ FastMathFlags FMF;
+ if (CI->hasUnsafeAlgebra()) {
+ // Unsafe algebra sets all fast-math-flags to true.
+ FMF.setUnsafeAlgebra();
+ } else {
+ // At a minimum, no-nans-fp-math must be true.
+ if (!CI->hasNoNaNs())
+ return nullptr;
+ // No-signed-zeros is implied by the definitions of fmax/fmin themselves:
+ // "Ideally, fmax would be sensitive to the sign of zero, for example
+ // fmax(-0.0, +0.0) would return +0; however, implementation in software
+ // might be impractical."
+ FMF.setNoSignedZeros();
+ FMF.setNoNaNs();
+ }
+ B.setFastMathFlags(FMF);
+
+ // We have a relaxed floating-point environment. We can ignore NaN-handling
+ // and transform to a compare and select. We do not have to consider errno or
+ // exceptions, because fmin/fmax do not have those.
+ Value *Op0 = CI->getArgOperand(0);
+ Value *Op1 = CI->getArgOperand(1);
+ Value *Cmp = Callee->getName().startswith("fmin") ?
+ B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1);
+ return B.CreateSelect(Cmp, Op0, Op1);
+}
+
+Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ if (!CI->hasUnsafeAlgebra())
+ return Ret;
+ Value *Op1 = CI->getArgOperand(0);
+ auto *OpC = dyn_cast<CallInst>(Op1);
+
+ // The earlier call must also be unsafe in order to do these transforms.
+ if (!OpC || !OpC->hasUnsafeAlgebra())
+ return Ret;
+
+ // log(pow(x,y)) -> y*log(x)
+ // This is only applicable to log, log2, log10.
+ if (Name != "log" && Name != "log2" && Name != "log10")
+ return Ret;
+
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ FastMathFlags FMF;
+ FMF.setUnsafeAlgebra();
+ B.setFastMathFlags(FMF);
+
+ LibFunc Func;
+ Function *F = OpC->getCalledFunction();
+ if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+ Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow))
+ return B.CreateFMul(OpC->getArgOperand(1),
+ emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
+ Callee->getAttributes()), "mul");
+
+ // log(exp2(y)) -> y*log(2)
+ if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) &&
+ TLI->has(Func) && Func == LibFunc_exp2)
+ return B.CreateFMul(
+ OpC->getArgOperand(0),
+ emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
+ Callee->getName(), B, Callee->getAttributes()),
+ "logmul");
+ return Ret;
+}
+
+Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ // TODO: Once we have a way (other than checking for the existence of the
+ // libcall) to tell whether our target can lower @llvm.sqrt, relax the
+ // condition below.
+ if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ if (!CI->hasUnsafeAlgebra())
+ return Ret;
+
+ Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
+ if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+ return Ret;
+
+ // We're looking for a repeated factor in a multiplication tree,
+ // so we can do this fold: sqrt(x * x) -> fabs(x);
+ // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y).
+ Value *Op0 = I->getOperand(0);
+ Value *Op1 = I->getOperand(1);
+ Value *RepeatOp = nullptr;
+ Value *OtherOp = nullptr;
+ if (Op0 == Op1) {
+ // Simple match: the operands of the multiply are identical.
+ RepeatOp = Op0;
+ } else {
+ // Look for a more complicated pattern: one of the operands is itself
+ // a multiply, so search for a common factor in that multiply.
+ // Note: We don't bother looking any deeper than this first level or for
+ // variations of this pattern because instcombine's visitFMUL and/or the
+ // reassociation pass should give us this form.
+ Value *OtherMul0, *OtherMul1;
+ if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
+ // Pattern: sqrt((x * y) * z)
+ if (OtherMul0 == OtherMul1 &&
+ cast<Instruction>(Op0)->hasUnsafeAlgebra()) {
+ // Matched: sqrt((x * x) * z)
+ RepeatOp = OtherMul0;
+ OtherOp = Op1;
+ }
+ }
+ }
+ if (!RepeatOp)
+ return Ret;
+
+ // Fast math flags for any created instructions should match the sqrt
+ // and multiply.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(I->getFastMathFlags());
+
+ // If we found a repeated factor, hoist it out of the square root and
+ // replace it with the fabs of that factor.
+ Module *M = Callee->getParent();
+ Type *ArgType = I->getType();
+ Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
+ Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
+ if (OtherOp) {
+ // If we found a non-repeated factor, we still need to get its square
+ // root. We then multiply that by the value that was simplified out
+ // of the square root calculation.
+ Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
+ Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
+ return B.CreateFMul(FabsCall, SqrtCall);
+ }
+ return FabsCall;
+}
+
+// TODO: Generalize to handle any trig function and its inverse.
+Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Value *Op1 = CI->getArgOperand(0);
+ auto *OpC = dyn_cast<CallInst>(Op1);
+ if (!OpC)
+ return Ret;
+
+ // Both calls must allow unsafe optimizations in order to remove them.
+ if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra())
+ return Ret;
+
+ // tan(atan(x)) -> x
+ // tanf(atanf(x)) -> x
+ // tanl(atanl(x)) -> x
+ LibFunc Func;
+ Function *F = OpC->getCalledFunction();
+ if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+ ((Func == LibFunc_atan && Callee->getName() == "tan") ||
+ (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
+ (Func == LibFunc_atanl && Callee->getName() == "tanl")))
+ Ret = OpC->getArgOperand(0);
+ return Ret;
+}
+
+static bool isTrigLibCall(CallInst *CI) {
+ // We can only hope to do anything useful if we can ignore things like errno
+ // and floating-point exceptions.
+ // We already checked the prototype.
+ return CI->hasFnAttr(Attribute::NoUnwind) &&
+ CI->hasFnAttr(Attribute::ReadNone);
+}
+
+static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+ bool UseFloat, Value *&Sin, Value *&Cos,
+ Value *&SinCos) {
+ Type *ArgTy = Arg->getType();
+ Type *ResTy;
+ StringRef Name;
+
+ Triple T(OrigCallee->getParent()->getTargetTriple());
+ if (UseFloat) {
+ Name = "__sincospif_stret";
+
+ assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+ // x86_64 can't use {float, float} since that would be returned in both
+ // xmm0 and xmm1, which isn't what a real struct would do.
+ ResTy = T.getArch() == Triple::x86_64
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
+ } else {
+ Name = "__sincospi_stret";
+ ResTy = StructType::get(ArgTy, ArgTy);
+ }
+
+ Module *M = OrigCallee->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+ ResTy, ArgTy);
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there.
+ B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+ } else {
+ // Otherwise (e.g. for a constant) the beginning of the function is as
+ // good a place as any.
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+ B.SetInsertPoint(&EntryBB, EntryBB.begin());
+ }
+
+ SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+ if (SinCos->getType()->isStructTy()) {
+ Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+ Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+ } else {
+ Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+ "sinpi");
+ Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+ "cospi");
+ }
+}
+
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
+ // Make sure the prototype is as expected, otherwise the rest of the
+ // function is probably invalid and likely to abort.
+ if (!isTrigLibCall(CI))
+ return nullptr;
+
+ Value *Arg = CI->getArgOperand(0);
+ SmallVector<CallInst *, 1> SinCalls;
+ SmallVector<CallInst *, 1> CosCalls;
+ SmallVector<CallInst *, 1> SinCosCalls;
+
+ bool IsFloat = Arg->getType()->isFloatTy();
+
+ // Look for all compatible sinpi, cospi and sincospi calls with the same
+ // argument. If there are enough (in some sense) we can make the
+ // substitution.
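+ // For instance, if the same argument feeds both a sinpi call and a cospi
+ // call, a single __sincospi_stret (or __sincospif_stret for float) call
+ // can serve both, as inserted below.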
+ Function *F = CI->getFunction();
+ for (User *U : Arg->users())
+ classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
+
+ // It's only worthwhile if both sinpi and cospi are actually used.
+ if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
+ return nullptr;
+
+ Value *Sin, *Cos, *SinCos;
+ insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+
+ auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
+ Value *Res) {
+ for (CallInst *C : Calls)
+ replaceAllUsesWith(C, Res);
+ };
+
+ replaceTrigInsts(SinCalls, Sin);
+ replaceTrigInsts(CosCalls, Cos);
+ replaceTrigInsts(SinCosCalls, SinCos);
+
+ return nullptr;
+}
+
+void LibCallSimplifier::classifyArgUse(
+ Value *Val, Function *F, bool IsFloat,
+ SmallVectorImpl<CallInst *> &SinCalls,
+ SmallVectorImpl<CallInst *> &CosCalls,
+ SmallVectorImpl<CallInst *> &SinCosCalls) {
+ CallInst *CI = dyn_cast<CallInst>(Val);
+
+ if (!CI)
+ return;
+
+ // Don't consider calls in other functions.
+ if (CI->getFunction() != F)
+ return;
+
+ Function *Callee = CI->getCalledFunction();
+ LibFunc Func;
+ if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
+ !isTrigLibCall(CI))
+ return;
+
+ if (IsFloat) {
+ if (Func == LibFunc_sinpif)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospif)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospif_stret)
+ SinCosCalls.push_back(CI);
+ } else {
+ if (Func == LibFunc_sinpi)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospi)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospi_stret)
+ SinCosCalls.push_back(CI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::cttz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+ return B.CreateSelect(Cond, V, B.getInt32(0));
+}
+
+Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
+ // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::ctlz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
+ V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
+ V);
+ return B.CreateIntCast(V, CI->getType(), false);
+}
+
+Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
+ // abs(x) -> x >s -1 ? x : -x
+ Value *Op = CI->getArgOperand(0);
+ Value *Pos =
+ B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos");
+ Value *Neg = B.CreateNeg(Op, "neg");
+ return B.CreateSelect(Pos, Op, Neg);
+}
+
+Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
+ // toascii(c) -> c & 0x7f
+ return B.CreateAnd(CI->getArgOperand(0),
+ ConstantInt::get(CI->getType(), 0x7F));
+}
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
+
+Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
+ int StreamArg) {
+ Function *Callee = CI->getCalledFunction();
+ // Error reporting calls should be cold, mark them as such.
+ // This applies even to non-builtin calls: it is only a hint and applies to
+ // functions that the frontend might not understand as builtins.
+
+ // This heuristic was suggested in:
+ // Improving Static Branch Prediction in a Compiler
+ // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+ // Proceedings of PACT'98, Oct. 1998, IEEE
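+ //
+ // For example, a perror(...) call, or an fprintf call whose stream
+ // argument is stderr, is marked cold here, hinting to later passes that
+ // the call is unlikely to execute.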
+ if (!CI->hasFnAttr(Attribute::Cold) &&
+ isReportingError(Callee, CI, StreamArg)) {
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
+ }
+
+ return nullptr;
+}
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
+ if (!Callee || !Callee->isDeclaration())
+ return false;
+
+ if (StreamArg < 0)
+ return true;
+
+ // These functions might be considered cold, but only if their stream
+ // argument is stderr.
+
+ if (StreamArg >= (int)CI->getNumArgOperands())
+ return false;
+ LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+ if (!LI)
+ return false;
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ if (!GV || !GV->isDeclaration())
+ return false;
+ return GV->getName() == "stderr";
+}
+
+Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+ return nullptr;
+
+ // Empty format string -> noop.
+ if (FormatStr.empty()) // Tolerate printf's declared void.
+ return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
+
+ // Do not do any of the following transformations if the printf return value
+ // is used, in general the printf return value is not compatible with either
+ // putchar() or puts().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // printf("x") -> putchar('x'), even for "%" and "%%".
+ if (FormatStr.size() == 1 || FormatStr == "%%")
+ return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
+
+ // printf("%s", "a") --> putchar('a')
+ if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
+ StringRef ChrStr;
+ if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
+ return nullptr;
+ if (ChrStr.size() != 1)
+ return nullptr;
+ return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
+ }
+
+ // printf("foo\n") --> puts("foo")
+ if (FormatStr[FormatStr.size() - 1] == '\n' &&
+ FormatStr.find('%') == StringRef::npos) { // No format characters.
+ // Create a string literal with no \n on it. We expect the constant merge
+ // pass to be run after this pass, to merge duplicate strings.
+ FormatStr = FormatStr.drop_back();
+ Value *GV = B.CreateGlobalString(FormatStr, "str");
+ return emitPutS(GV, B, TLI);
+ }
+
+ // Optimize specific format strings.
+ // printf("%c", chr) --> putchar(chr)
+ if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isIntegerTy())
+ return emitPutChar(CI->getArgOperand(1), B, TLI);
+
+ // printf("%s\n", str) --> puts(str)
+ if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isPointerTy())
+ return emitPutS(CI->getArgOperand(1), B, TLI);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
+
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizePrintFString(CI, B)) {
+ return V;
+ }
+
+ // printf(format, ...) -> iprintf(format, ...) if no floating point
+ // arguments.
+ if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *IPrintFFn =
+ M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(IPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 2) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%')
+ return nullptr; // we found a format specifier, bail out.
+
+ // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1),
+ 1); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+
+ Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
+ if (!Len)
+ return nullptr;
+ Value *IncLen =
+ B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+ // The sprintf result is the unincremented number of bytes in the string.
+ return B.CreateIntCast(Len, CI->getType(), false);
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeSPrintFString(CI, B)) {
+ return V;
+ }
+
+ // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+ // point arguments.
+ if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *SIPrintFFn =
+ M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
+ optimizeErrorReporting(CI, B, 0);
+
+ // All the optimizations depend on the format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // Do not do any of the following transformations if the fprintf return
+ // value is used, in general the fprintf return value is not compatible
+ // with fwrite(), fputc() or fputs().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+ if (CI->getNumArgOperands() == 2) {
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
+ return nullptr; // We found a format specifier.
+
+ return emitFWrite(
+ CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
+ CI->getArgOperand(0), B, DL, TLI);
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // fprintf(F, "%c", chr) --> fputc(chr, F)
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+
+ if (FormatStr[1] == 's') {
+ // fprintf(F, "%s", str) --> fputs(str, F)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+ return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeFPrintFString(CI, B)) {
+ return V;
+ }
+
+ // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+ // floating point arguments.
+ if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *FIPrintFFn =
+ M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(FIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
+ optimizeErrorReporting(CI, B, 3);
+
+ // Get the element size and count.
+ ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeC || !CountC)
+ return nullptr;
+ uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+ // This optimisation is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
+ optimizeErrorReporting(CI, B, 1);
+
+ // Don't rewrite fputs to fwrite when optimising for size because fwrite
+ // requires more arguments and thus extra MOVs are required.
+ if (CI->getParent()->getParent()->optForSize())
+ return nullptr;
+
+ // We can't optimize if return value is used.
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+ uint64_t Len = GetStringLength(CI->getArgOperand(0));
+ if (!Len)
+ return nullptr;
+
+ // Known to have no uses (see above).
+ return emitFWrite(
+ CI->getArgOperand(0),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
+ CI->getArgOperand(1), B, DL, TLI);
+}
+
+Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
+ // Check for a constant string.
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ if (Str.empty() && CI->use_empty()) {
+ // puts("") -> putchar('\n')
+ Value *Res = emitPutChar(B.getInt32('\n'), B, TLI);
+ if (CI->use_empty() || !Res)
+ return Res;
+ return B.CreateIntCast(Res, CI->getType(), true);
+ }
+
+ return nullptr;
+}
+
+bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
+ LibFunc Func;
+ SmallString<20> FloatFuncName = FuncName;
+ FloatFuncName += 'f';
+ if (TLI->getLibFunc(FloatFuncName, Func))
+ return TLI->has(Func);
+ return false;
+}
+
+Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
+ IRBuilder<> &Builder) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ // Check for string/memory library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // Make sure we never change the calling convention.
+ assert((ignoreCallingConv(Func) ||
+ isCallingConvCCompatible(CI)) &&
+ "Optimizing string/memory libcall would change the calling convention");
+ switch (Func) {
+ case LibFunc_strcat:
+ return optimizeStrCat(CI, Builder);
+ case LibFunc_strncat:
+ return optimizeStrNCat(CI, Builder);
+ case LibFunc_strchr:
+ return optimizeStrChr(CI, Builder);
+ case LibFunc_strrchr:
+ return optimizeStrRChr(CI, Builder);
+ case LibFunc_strcmp:
+ return optimizeStrCmp(CI, Builder);
+ case LibFunc_strncmp:
+ return optimizeStrNCmp(CI, Builder);
+ case LibFunc_strcpy:
+ return optimizeStrCpy(CI, Builder);
+ case LibFunc_stpcpy:
+ return optimizeStpCpy(CI, Builder);
+ case LibFunc_strncpy:
+ return optimizeStrNCpy(CI, Builder);
+ case LibFunc_strlen:
+ return optimizeStrLen(CI, Builder);
+ case LibFunc_strpbrk:
+ return optimizeStrPBrk(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ return optimizeStrTo(CI, Builder);
+ case LibFunc_strspn:
+ return optimizeStrSpn(CI, Builder);
+ case LibFunc_strcspn:
+ return optimizeStrCSpn(CI, Builder);
+ case LibFunc_strstr:
+ return optimizeStrStr(CI, Builder);
+ case LibFunc_memchr:
+ return optimizeMemChr(CI, Builder);
+ case LibFunc_memcmp:
+ return optimizeMemCmp(CI, Builder);
+ case LibFunc_memcpy:
+ return optimizeMemCpy(CI, Builder);
+ case LibFunc_memmove:
+ return optimizeMemMove(CI, Builder);
+ case LibFunc_memset:
+ return optimizeMemSet(CI, Builder);
+ case LibFunc_wcslen:
+ return optimizeWcslen(CI, Builder);
+ default:
+ break;
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+ if (CI->isNoBuiltin())
+ return nullptr;
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ StringRef FuncName = Callee->getName();
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ // Command-line parameter overrides instruction attribute.
+ if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
+ UnsafeFPShrink = EnableUnsafeFPShrink;
+ else if (isa<FPMathOperator>(CI) && CI->hasUnsafeAlgebra())
+ UnsafeFPShrink = true;
+
+ // First, check for intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (!isCallingConvC)
+ return nullptr;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::pow:
+ return optimizePow(CI, Builder);
+ case Intrinsic::exp2:
+ return optimizeExp2(CI, Builder);
+ case Intrinsic::log:
+ return optimizeLog(CI, Builder);
+ case Intrinsic::sqrt:
+ return optimizeSqrt(CI, Builder);
+ // TODO: Use foldMallocMemset() with memset intrinsic.
+ default:
+ return nullptr;
+ }
+ }
+
+ // Also try to simplify calls to fortified library functions.
+ if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) {
+ // Try to further simplify the result.
+ CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
+ if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
+ // Use an IR Builder from SimplifiedCI if available instead of CI
+ // to guarantee we reach all uses we might replace later on.
+ IRBuilder<> TmpBuilder(SimplifiedCI);
+ if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) {
+ // If we were able to further simplify, remove the now redundant call.
+ SimplifiedCI->replaceAllUsesWith(V);
+ SimplifiedCI->eraseFromParent();
+ return V;
+ }
+ }
+ return SimplifiedFortifiedCI;
+ }
+
+ // Then check for known library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+ if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
+ return V;
+ switch (Func) {
+ case LibFunc_cosf:
+ case LibFunc_cos:
+ case LibFunc_cosl:
+ return optimizeCos(CI, Builder);
+ case LibFunc_sinpif:
+ case LibFunc_sinpi:
+ case LibFunc_cospif:
+ case LibFunc_cospi:
+ return optimizeSinCosPi(CI, Builder);
+ case LibFunc_powf:
+ case LibFunc_pow:
+ case LibFunc_powl:
+ return optimizePow(CI, Builder);
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ return optimizeExp2(CI, Builder);
+ case LibFunc_fabsf:
+ case LibFunc_fabs:
+ case LibFunc_fabsl:
+ return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtl:
+ return optimizeSqrt(CI, Builder);
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
+ return optimizeFFS(CI, Builder);
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
+ return optimizeFls(CI, Builder);
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
+ return optimizeAbs(CI, Builder);
+ case LibFunc_isdigit:
+ return optimizeIsDigit(CI, Builder);
+ case LibFunc_isascii:
+ return optimizeIsAscii(CI, Builder);
+ case LibFunc_toascii:
+ return optimizeToAscii(CI, Builder);
+ case LibFunc_printf:
+ return optimizePrintF(CI, Builder);
+ case LibFunc_sprintf:
+ return optimizeSPrintF(CI, Builder);
+ case LibFunc_fprintf:
+ return optimizeFPrintF(CI, Builder);
+ case LibFunc_fwrite:
+ return optimizeFWrite(CI, Builder);
+ case LibFunc_fputs:
+ return optimizeFPuts(CI, Builder);
+ case LibFunc_log:
+ case LibFunc_log10:
+ case LibFunc_log1p:
+ case LibFunc_log2:
+ case LibFunc_logb:
+ return optimizeLog(CI, Builder);
+ case LibFunc_puts:
+ return optimizePuts(CI, Builder);
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ return optimizeTan(CI, Builder);
+ case LibFunc_perror:
+ return optimizeErrorReporting(CI, Builder);
+ case LibFunc_vfprintf:
+ case LibFunc_fiprintf:
+ return optimizeErrorReporting(CI, Builder, 0);
+ case LibFunc_fputc:
+ return optimizeErrorReporting(CI, Builder, 1);
+ case LibFunc_ceil:
+ return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
+ case LibFunc_floor:
+ return replaceUnaryCall(CI, Builder, Intrinsic::floor);
+ case LibFunc_round:
+ return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_nearbyint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+ case LibFunc_rint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::rint);
+ case LibFunc_trunc:
+ return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
+ case LibFunc_acos:
+ case LibFunc_acosh:
+ case LibFunc_asin:
+ case LibFunc_asinh:
+ case LibFunc_atan:
+ case LibFunc_atanh:
+ case LibFunc_cbrt:
+ case LibFunc_cosh:
+ case LibFunc_exp:
+ case LibFunc_exp10:
+ case LibFunc_expm1:
+ case LibFunc_sin:
+ case LibFunc_sinh:
+ case LibFunc_tanh:
+ if (UnsafeFPShrink && hasFloatVersion(FuncName))
+ return optimizeUnaryDoubleFP(CI, Builder, true);
+ return nullptr;
+ case LibFunc_copysign:
+ if (hasFloatVersion(FuncName))
+ return optimizeBinaryDoubleFP(CI, Builder);
+ return nullptr;
+ case LibFunc_fminf:
+ case LibFunc_fmin:
+ case LibFunc_fminl:
+ case LibFunc_fmaxf:
+ case LibFunc_fmax:
+ case LibFunc_fmaxl:
+ return optimizeFMinFMax(CI, Builder);
+ default:
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
+LibCallSimplifier::LibCallSimplifier(
+ const DataLayout &DL, const TargetLibraryInfo *TLI,
+ function_ref<void(Instruction *, Value *)> Replacer)
+ : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false),
+ Replacer(Replacer) {}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
+ // Indirect through the replacer used in this instance.
+ Replacer(I, With);
+}
+
+// TODO:
+// Additional cases that we need to add to this file:
+//
+// cbrt:
+// * cbrt(expN(X)) -> expN(x/3)
+// * cbrt(sqrt(x)) -> pow(x,1/6)
+// * cbrt(cbrt(x)) -> pow(x,1/9)
+//
+// exp, expf, expl:
+// * exp(log(x)) -> x
+//
+// log, logf, logl:
+// * log(exp(x)) -> x
+// * log(exp(y)) -> y*log(e)
+// * log(exp10(y)) -> y*log(10)
+// * log(sqrt(x)) -> 0.5*log(x)
+//
+// pow, powf, powl:
+// * pow(sqrt(x),y) -> pow(x,y*0.5)
+// * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// signbit:
+// * signbit(cnst) -> cnst'
+// * signbit(nncst) -> 0 (if nncst is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+// * sqrt(expN(x)) -> expN(x*0.5)
+// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
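+// A fortified _chk call can be folded to its plain counterpart when the
+// runtime object-size check is provably redundant. For example,
+// __memcpy_chk(dst, src, n, -1) can become a plain memcpy: an object size
+// of -1 means the size was unknown at compile time, so the check can never
+// fire.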
+bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
+ unsigned ObjSizeOp,
+ unsigned SizeOp,
+ bool isString) {
+ if (CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(SizeOp))
+ return true;
+ if (ConstantInt *ObjSizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
+ if (ObjSizeCI->isMinusOne())
+ return true;
+ // If the object size wasn't -1 (unknown), bail out if we were asked to.
+ if (OnlyLowerUnknownSize)
+ return false;
+ if (isString) {
+ uint64_t Len = GetStringLength(CI->getArgOperand(SizeOp));
+ // If the length is 0 we don't know how long it is and so we can't
+ // remove the check.
+ if (Len == 0)
+ return false;
+ return ObjSizeCI->getZExtValue() >= Len;
+ }
+ if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeOp)))
+ return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
+ }
+ return false;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
+ IRBuilder<> &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
+ IRBuilder<> &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
+ IRBuilder<> &B) {
+ // TODO: Try foldMallocMemset() here.
+
+ if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
+ IRBuilder<> &B,
+ LibFunc Func) {
+ Function *Callee = CI->getCalledFunction();
+ StringRef Name = Callee->getName();
+ const DataLayout &DL = CI->getModule()->getDataLayout();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
+ *ObjSize = CI->getArgOperand(2);
+
+ // __stpcpy_chk(x,x,...) -> x+strlen(x)
+ if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // If a) we don't have any length information, or b) we know this will
+ // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
+ // st[rp]cpy_chk call which may fail at runtime if the size is too long.
+ // TODO: It might be nice to get a maximum length out of the possible
+ // string lengths when the source string varies.
+ if (isFortifiedCallFoldable(CI, 2, 1, true))
+ return emitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
+
+ if (OnlyLowerUnknownSize)
+ return nullptr;
+
+ // Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0)
+ return nullptr;
+
+ Type *SizeTTy = DL.getIntPtrType(CI->getContext());
+ Value *LenV = ConstantInt::get(SizeTTy, Len);
+ Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+ // If the function was an __stpcpy_chk, and we were able to fold it into
+ // a __memcpy_chk, we still need to return the correct end pointer.
+ if (Ret && Func == LibFunc_stpcpy_chk)
+ return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
+ return Ret;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
+ IRBuilder<> &B,
+ LibFunc Func) {
+ Function *Callee = CI->getCalledFunction();
+ StringRef Name = Callee->getName();
+ if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+ Value *Ret = emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI, Name.substr(2, 7));
+ return Ret;
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
+ // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
+ // Some clang users checked for _chk libcall availability using:
+ // __has_builtin(__builtin___memcpy_chk)
+ // When compiling with -fno-builtin, this is always true.
+ // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we
+ // end up with fortified libcalls, which isn't acceptable in a freestanding
+ // environment which only provides their non-fortified counterparts.
+ //
+ // Until we change clang and/or teach external users to check for availability
+ // differently, disregard the "nobuiltin" attribute and TLI::has.
+ //
+ // PR23093.
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ // First, check that this is a known library function and that the prototype
+ // is correct.
+ if (!TLI->getLibFunc(*Callee, Func))
+ return nullptr;
+
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+
+ switch (Func) {
+ case LibFunc_memcpy_chk:
+ return optimizeMemCpyChk(CI, Builder);
+ case LibFunc_memmove_chk:
+ return optimizeMemMoveChk(CI, Builder);
+ case LibFunc_memset_chk:
+ return optimizeMemSetChk(CI, Builder);
+ case LibFunc_stpcpy_chk:
+ case LibFunc_strcpy_chk:
+ return optimizeStrpCpyChk(CI, Builder, Func);
+ case LibFunc_stpncpy_chk:
+ case LibFunc_strncpy_chk:
+ return optimizeStrpNCpyChk(CI, Builder, Func);
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
+ const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
+ : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp
new file mode 100644
index 000000000000..e9a368f4faa4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp
@@ -0,0 +1,263 @@
+//===- SplitModule.cpp - Split a module into partitions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the function llvm::SplitModule, which splits a module
+// into multiple linkable partitions. It can be used to implement parallel code
+// generation for link-time optimization.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "split-module"
+
+#include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>
+
+using namespace llvm;
+
+namespace {
+typedef EquivalenceClasses<const GlobalValue *> ClusterMapType;
+typedef DenseMap<const Comdat *, const GlobalValue *> ComdatMembersType;
+typedef DenseMap<const GlobalValue *, unsigned> ClusterIDMapType;
+}
+
+static void addNonConstUser(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const User *U) {
+ assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
+
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ const GlobalValue *F = I->getParent()->getParent();
+ GVtoClusterMap.unionSets(GV, F);
+ } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
+ isa<GlobalVariable>(U)) {
+ GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
+ } else {
+ llvm_unreachable("Underimplemented use case");
+ }
+}
+
+// Adds all GlobalValue users of V to the same cluster as GV.
+static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const Value *V) {
+ for (auto *U : V->users()) {
+ SmallVector<const User *, 4> Worklist;
+ Worklist.push_back(U);
+ while (!Worklist.empty()) {
+ const User *UU = Worklist.pop_back_val();
+ // For each constant that is not a GV (a pure const) recurse.
+ if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
+ Worklist.append(UU->user_begin(), UU->user_end());
+ continue;
+ }
+ addNonConstUser(GVtoClusterMap, GV, UU);
+ }
+ }
+}
+
+// Find partitions for the module such that no locals need to be globalized.
+// Try to pack those partitions into N files in a balanced way, since this
+// roughly equals thread balancing for the backend codegen step.
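+// For example, an internal (local) function and every global that references
+// it end up in the same cluster, so the split never has to give it external
+// linkage; clusters are then assigned greedily to whichever of the N
+// partitions currently holds the fewest globals.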
+static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
+ unsigned N) {
+ // At this point the module should have the proper mix of globals and locals.
+ // As we attempt to partition this module, we must not change any
+ // locals to globals.
+ DEBUG(dbgs() << "Partition module with (" << M->size() << ") functions\n");
+ ClusterMapType GVtoClusterMap;
+ ComdatMembersType ComdatMembers;
+
+ auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
+ if (GV.isDeclaration())
+ return;
+
+ if (!GV.hasName())
+ GV.setName("__llvmsplit_unnamed");
+
+ // Comdat groups must not be partitioned. For comdat groups that contain
+ // locals, record all their members here so we can keep them together.
+ // Comdat groups that only contain external globals are already handled by
+ // the MD5-based partitioning.
+ if (const Comdat *C = GV.getComdat()) {
+ auto &Member = ComdatMembers[C];
+ if (Member)
+ GVtoClusterMap.unionSets(Member, &GV);
+ else
+ Member = &GV;
+ }
+
+ // For aliases we should not separate them from their aliasees regardless
+ // of linkage.
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GVtoClusterMap.unionSets(&GV, Base);
+ }
+
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ for (const BasicBlock &BB : *F) {
+ BlockAddress *BA = BlockAddress::lookup(&BB);
+ if (!BA || !BA->isConstantUsed())
+ continue;
+ addAllGlobalValueUsers(GVtoClusterMap, F, BA);
+ }
+ }
+
+ if (GV.hasLocalLinkage())
+ addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+ };
+
+ std::for_each(M->begin(), M->end(), recordGVSet);
+ std::for_each(M->global_begin(), M->global_end(), recordGVSet);
+ std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
+
+  // Assign all GVs to merged clusters while balancing the number of objects
+  // in each.
+ auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
+ const std::pair<unsigned, unsigned> &b) {
+ if (a.second || b.second)
+ return a.second > b.second;
+ else
+ return a.first > b.first;
+ };
+
+ std::priority_queue<std::pair<unsigned, unsigned>,
+ std::vector<std::pair<unsigned, unsigned>>,
+ decltype(CompareClusters)>
+ BalancinQueue(CompareClusters);
+ // Pre-populate priority queue with N slot blanks.
+ for (unsigned i = 0; i < N; ++i)
+ BalancinQueue.push(std::make_pair(i, 0));
+
+ typedef std::pair<unsigned, ClusterMapType::iterator> SortType;
+ SmallVector<SortType, 64> Sets;
+ SmallPtrSet<const GlobalValue *, 32> Visited;
+
+  // To guarantee determinism, we have to sort the clusters by size. When
+  // sizes are equal, break the tie with the leader's name.
+ for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
+ E = GVtoClusterMap.end(); I != E; ++I)
+ if (I->isLeader())
+ Sets.push_back(
+ std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
+ GVtoClusterMap.member_end()), I));
+
+ std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() > b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
+
+ for (auto &I : Sets) {
+ unsigned CurrentClusterID = BalancinQueue.top().first;
+ unsigned CurrentClusterSize = BalancinQueue.top().second;
+ BalancinQueue.pop();
+
+ DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first
+ << ") ----> " << I.second->getData()->getName() << "\n");
+
+ for (ClusterMapType::member_iterator MI =
+ GVtoClusterMap.findLeader(I.second);
+ MI != GVtoClusterMap.member_end(); ++MI) {
+ if (!Visited.insert(*MI).second)
+ continue;
+ DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ ClusterIDMap[*MI] = CurrentClusterID;
+ CurrentClusterSize++;
+ }
+ // Add this set size to the number of entries in this cluster.
+ BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
+ }
+}
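+
+// For example (illustrative sizes): with N == 2 and sorted cluster sizes of
+// 5, 4, 4 and 2, the loop above assigns 5 to partition 0, the first 4 to
+// partition 1 (still empty), the second 4 to partition 1 again (4 < 5), and
+// the final 2 to partition 0 (5 < 8), for totals of 7 and 8 objects.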
+
+static void externalize(GlobalValue *GV) {
+ if (GV->hasLocalLinkage()) {
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ // Unnamed entities must be named consistently between modules. setName will
+ // give a distinct name to each such entity.
+ if (!GV->hasName())
+ GV->setName("__llvmsplit_unnamed");
+}
+
+// Returns whether GV should be in partition (0-based) I of N.
+static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GV = Base;
+
+ StringRef Name;
+ if (const Comdat *C = GV->getComdat())
+ Name = C->getName();
+ else
+ Name = GV->getName();
+
+  // Partition by MD5 hash. We only need a few bits for an even spread, as the
+  // number of partitions will generally have only one or two digits; the low
+  // 16 bits are enough.
+ MD5 H;
+ MD5::MD5Result R;
+ H.update(Name);
+ H.final(R);
+ return (R[0] | (R[1] << 8)) % N == I;
+}
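+
+// For example (illustrative hash value): with N == 4, a name whose first two
+// MD5 result bytes are 0x2b and 0x1a combines them into 0x1a2b == 6699, so it
+// is assigned to partition 6699 % 4 == 3.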
+
+void llvm::SplitModule(
+ std::unique_ptr<Module> M, unsigned N,
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
+ bool PreserveLocals) {
+ if (!PreserveLocals) {
+ for (Function &F : *M)
+ externalize(&F);
+ for (GlobalVariable &GV : M->globals())
+ externalize(&GV);
+ for (GlobalAlias &GA : M->aliases())
+ externalize(&GA);
+ for (GlobalIFunc &GIF : M->ifuncs())
+ externalize(&GIF);
+ }
+
+ // This performs splitting without a need for externalization, which might not
+ // always be possible.
+ ClusterIDMapType ClusterIDMap;
+ findPartitions(M.get(), ClusterIDMap, N);
+
+ // FIXME: We should be able to reuse M as the last partition instead of
+ // cloning it.
+ for (unsigned I = 0; I < N; ++I) {
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MPart(
+ CloneModule(M.get(), VMap, [&](const GlobalValue *GV) {
+ if (ClusterIDMap.count(GV))
+ return (ClusterIDMap[GV] == I);
+ else
+ return isInPartition(GV, I, N);
+ }));
+ if (I != 0)
+ MPart->setModuleInlineAsm("");
+ ModuleCallback(std::move(MPart));
+ }
+}
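
The SplitModule entry point above takes ownership of the module and hands each
partition back through a callback. A minimal driver sketch follows; the output
file naming and the bitcode-writer/file-system headers are illustrative
assumptions, not part of this change:

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <memory>
#include <string>

// Split M into four partitions and write each one to partN.bc.
static void splitAndWriteBitcode(std::unique_ptr<llvm::Module> M) {
  unsigned Part = 0;
  llvm::SplitModule(std::move(M), /*N=*/4,
                    [&Part](std::unique_ptr<llvm::Module> MPart) {
                      std::error_code EC;
                      llvm::raw_fd_ostream OS(
                          "part" + std::to_string(Part++) + ".bc", EC,
                          llvm::sys::fs::F_None);
                      if (!EC)
                        llvm::WriteBitcodeToFile(MPart.get(), OS);
                    },
                    /*PreserveLocals=*/false);
}

Each callback invocation receives a partition that can be compiled
independently, which is how a parallel code-generation driver would consume
them.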
diff --git a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
new file mode 100644
index 000000000000..49dc15cf5e7c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -0,0 +1,80 @@
+//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a small utility pass that removes the gc.relocate intrinsics
+// inserted by RewriteStatepointsForGC. Note that the resulting IR is
+// incorrect with respect to GC, but the pass is useful on its own for
+// analyzing IR without the gc.relocates. The statepoint and gc.result
+// intrinsics remain present.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+namespace {
+struct StripGCRelocates : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ StripGCRelocates() : FunctionPass(ID) {
+ initializeStripGCRelocatesPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &Info) const override {}
+
+ bool runOnFunction(Function &F) override;
+
+};
+char StripGCRelocates::ID = 0;
+}
+
+bool StripGCRelocates::runOnFunction(Function &F) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration())
+ return false;
+ SmallVector<GCRelocateInst *, 20> GCRelocates;
+ // TODO: We currently do not handle gc.relocates that are in landing pads,
+ // i.e. not bound to a single statepoint token.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
+ if (isStatepoint(GCR->getOperand(0)))
+ GCRelocates.push_back(GCR);
+ }
+ // All gc.relocates are bound to a single statepoint token. The order of
+ // visiting gc.relocates for deletion does not matter.
+ for (GCRelocateInst *GCRel : GCRelocates) {
+ Value *OrigPtr = GCRel->getDerivedPtr();
+ Value *ReplaceGCRel = OrigPtr;
+
+    // All gc.relocates are i8 addrspace(1)* typed; we need a bitcast from i8
+    // addrspace(1)* to the type of OrigPtr if the two types differ.
+ if (GCRel->getType() != OrigPtr->getType())
+ ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
+
+    // Replace all uses of the gc.relocate and then delete it.
+    // There may be unnecessary bitcasts back to the OrigPtr type; an
+    // instcombine pass would clean these up.
+ GCRel->replaceAllUsesWith(ReplaceGCRel);
+ GCRel->eraseFromParent();
+ }
+ return !GCRelocates.empty();
+}
+
+INITIALIZE_PASS(StripGCRelocates, "strip-gc-relocates",
+ "Strip gc.relocates inserted through RewriteStatepointsForGC",
+ true, false)
+FunctionPass *llvm::createStripGCRelocatesPass() {
+ return new StripGCRelocates();
+}
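
A sketch of scheduling the pass from C++ with the legacy pass manager; the
assumption (matching the include above) is that createStripGCRelocatesPass()
is declared in llvm/Transforms/Scalar.h:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

// Remove gc.relocate intrinsics from every function in M.
static void stripGCRelocates(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createStripGCRelocatesPass());
  PM.run(M);
}

Because the pass is registered under the name "strip-gc-relocates", tools that
schedule passes by name can request it the same way.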
diff --git a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
new file mode 100644
index 000000000000..cd0378e0140c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -0,0 +1,42 @@
+//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+
+/// This pass strips all debug info that is not related to line tables.
+/// The result will be the same as if the program were compiled with
+/// -gline-tables-only.
+struct StripNonLineTableDebugInfo : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ StripNonLineTableDebugInfo() : ModulePass(ID) {
+ initializeStripNonLineTableDebugInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) override {
+ return llvm::stripNonLineTableDebugInfo(M);
+ }
+};
+}
+
+char StripNonLineTableDebugInfo::ID = 0;
+INITIALIZE_PASS(StripNonLineTableDebugInfo, "strip-nonlinetable-debuginfo",
+ "Strip all debug info except linetables", false, false)
+
+ModulePass *llvm::createStripNonLineTableDebugInfoPass() {
+ return new StripNonLineTableDebugInfo();
+}
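
The pass is a thin wrapper around the llvm::stripNonLineTableDebugInfo utility
it calls, so the same reduction can be applied to a module directly; a minimal
sketch, assuming the utility is declared in llvm/IR/DebugInfo.h as included
above:

#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"

// Reduce M's debug info to what -gline-tables-only would have produced.
// Returns true if anything changed.
static bool reduceToLineTablesOnly(llvm::Module &M) {
  return llvm::stripNonLineTableDebugInfo(M);
}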
diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
new file mode 100644
index 000000000000..20107553665f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -0,0 +1,565 @@
+//===- SymbolRewriter.cpp - Symbol Rewriter ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SymbolRewriter is an LLVM pass that can rewrite symbols transparently within
+// existing code. It is implemented as a compiler pass and is configured via a
+// YAML configuration file.
+//
+// The YAML configuration file format is as follows:
+//
+// RewriteMapFile := RewriteDescriptors
+// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
+// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
+// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
+// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
+// RewriteDescriptorType := Identifier
+// FieldIdentifier := Identifier
+// FieldValue := Identifier
+// Identifier := [0-9a-zA-Z]+
+//
+// Currently, the following descriptor types are supported:
+//
+// - function: (function rewriting)
+// + Source (original name of the function)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// + Naked (boolean, whether the function is undecorated)
+// - global variable: (external linkage global variable rewriting)
+// + Source (original name of externally visible variable)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// - global alias: (global alias rewriting)
+// + Source (original name of the aliased name)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+//
+// Note that Source and exactly one of [Target, Transform] must be provided.
+// An illustrative map file is shown below.
+//
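+// For example (illustrative names), a map that renames the function memcpy
+// outright and rewrites every global variable whose name matches the pattern
+// "counter" could be written, using the flow form of the grammar above, as:
+//
+//   function: { source: memcpy, target: memcpyFast }
+//   global variable: { source: counter, transform: tally }
+//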
+// New rewrite descriptors can be created. Adding a new rewrite descriptor
+// involves:
+//
+// a) extending the rewrite descriptor kind enumeration
+// (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
+// b) implementing the new descriptor
+// (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
+// c) extending the rewrite map parser
+// (<anonymous>::RewriteMapParser::parseEntry)
+//
+// Specify to rewrite the symbols using the `-rewrite-symbols` option, and
+// specify the map file to use for the rewriting via the `-rewrite-map-file`
+// option.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "symbol-rewriter"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace SymbolRewriter;
+
+static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
+ cl::desc("Symbol Rewrite Map"),
+ cl::value_desc("filename"));
+
+static void rewriteComdat(Module &M, GlobalObject *GO,
+ const std::string &Source,
+ const std::string &Target) {
+ if (Comdat *CD = GO->getComdat()) {
+ auto &Comdats = M.getComdatSymbolTable();
+
+ Comdat *C = M.getOrInsertComdat(Target);
+ C->setSelectionKind(CD->getSelectionKind());
+ GO->setComdat(C);
+
+ Comdats.erase(Comdats.find(Source));
+ }
+}
+
+namespace {
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(llvm::Module::*Get)(StringRef) const>
+class ExplicitRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Source;
+ const std::string Target;
+
+ ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
+ : RewriteDescriptor(DT), Source(Naked ? StringRef("\01" + S.str()) : S),
+ Target(T) {}
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(llvm::Module::*Get)(StringRef) const>
+bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
+ bool Changed = false;
+ if (ValueType *S = (M.*Get)(Source)) {
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
+ rewriteComdat(M, GO, Source, Target);
+
+ if (Value *T = (M.*Get)(Target))
+ S->setValueName(T->getValueName());
+ else
+ S->setName(Target);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(llvm::Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (llvm::Module::*Iterator)()>
+class PatternRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Pattern;
+ const std::string Transform;
+
+ PatternRewriteDescriptor(StringRef P, StringRef T)
+ : RewriteDescriptor(DT), Pattern(P), Transform(T) { }
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(llvm::Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (llvm::Module::*Iterator)()>
+bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
+performOnModule(Module &M) {
+ bool Changed = false;
+ for (auto &C : (M.*Iterator)()) {
+ std::string Error;
+
+ std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
+ if (!Error.empty())
+      report_fatal_error("unable to transform " + C.getName() + " in " +
+ M.getModuleIdentifier() + ": " + Error);
+
+ if (C.getName() == Name)
+ continue;
+
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
+ rewriteComdat(M, GO, C.getName(), Name);
+
+ if (Value *V = (M.*Get)(Name))
+ C.setValueName(V->getValueName());
+ else
+ C.setName(Name);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// Represents a rewrite for an explicitly named (function) symbol. Both the
+/// source function name and target function name of the transformation are
+/// explicitly spelt out.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function,
+ llvm::Function, &llvm::Module::getFunction>
+ ExplicitRewriteFunctionDescriptor;
+
+/// Represents a rewrite for an explicitly named (global variable) symbol. Both
+/// the source variable name and target variable name are spelt out. This
+/// applies only to module level variables.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ llvm::GlobalVariable,
+ &llvm::Module::getGlobalVariable>
+ ExplicitRewriteGlobalVariableDescriptor;
+
+/// Represents a rewrite for an explicitly named global alias. Both the source
+/// and target name are explicitly spelt out.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias,
+ llvm::GlobalAlias,
+ &llvm::Module::getNamedAlias>
+ ExplicitRewriteNamedAliasDescriptor;
+
+/// Represents a rewrite for functions matching a regular-expression pattern.
+/// A pattern for the function name and a transformation that derives the
+/// target function name from it together form the rewrite rule.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::Function,
+ llvm::Function, &llvm::Module::getFunction,
+ &llvm::Module::functions>
+ PatternRewriteFunctionDescriptor;
+
+/// Represents a rewrite for a global variable based upon a matching pattern.
+/// Each global variable matching the provided pattern will be transformed as
+/// described in the transformation pattern for the target. Applies only to
+/// module level variables.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ llvm::GlobalVariable,
+ &llvm::Module::getGlobalVariable,
+ &llvm::Module::globals>
+ PatternRewriteGlobalVariableDescriptor;
+
+/// PatternRewriteNamedAliasDescriptor - represents a rewrite for global
+/// aliases which match a given pattern. The provided transformation will be
+/// applied to each of the matching names.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias,
+ llvm::GlobalAlias,
+ &llvm::Module::getNamedAlias,
+ &llvm::Module::aliases>
+ PatternRewriteNamedAliasDescriptor;
+} // namespace
+
+bool RewriteMapParser::parse(const std::string &MapFile,
+ RewriteDescriptorList *DL) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Mapping =
+ MemoryBuffer::getFile(MapFile);
+
+ if (!Mapping)
+ report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
+ Mapping.getError().message());
+
+ if (!parse(*Mapping, DL))
+ report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
+
+ return true;
+}
+
+bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
+ RewriteDescriptorList *DL) {
+ SourceMgr SM;
+ yaml::Stream YS(MapFile->getBuffer(), SM);
+
+ for (auto &Document : YS) {
+ yaml::MappingNode *DescriptorList;
+
+ // ignore empty documents
+ if (isa<yaml::NullNode>(Document.getRoot()))
+ continue;
+
+ DescriptorList = dyn_cast<yaml::MappingNode>(Document.getRoot());
+ if (!DescriptorList) {
+ YS.printError(Document.getRoot(), "DescriptorList node must be a map");
+ return false;
+ }
+
+ for (auto &Descriptor : *DescriptorList)
+ if (!parseEntry(YS, Descriptor, DL))
+ return false;
+ }
+
+ return true;
+}
+
+bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
+ RewriteDescriptorList *DL) {
+ yaml::ScalarNode *Key;
+ yaml::MappingNode *Value;
+ SmallString<32> KeyStorage;
+ StringRef RewriteType;
+
+ Key = dyn_cast<yaml::ScalarNode>(Entry.getKey());
+ if (!Key) {
+ YS.printError(Entry.getKey(), "rewrite type must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::MappingNode>(Entry.getValue());
+ if (!Value) {
+ YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
+ return false;
+ }
+
+ RewriteType = Key->getValue(KeyStorage);
+ if (RewriteType.equals("function"))
+ return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global variable"))
+ return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global alias"))
+ return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
+
+ YS.printError(Entry.getKey(), "unknown rewrite type");
+ return false;
+}
+
+bool RewriteMapParser::
+parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ bool Naked = false;
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = Value->getValue(ValueStorage);
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = Value->getValue(ValueStorage);
+ } else if (KeyValue.equals("transform")) {
+ Transform = Value->getValue(ValueStorage);
+ } else if (KeyValue.equals("naked")) {
+ std::string Undecorated;
+
+ Undecorated = Value->getValue(ValueStorage);
+ Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
+ } else {
+ YS.printError(Field.getKey(), "unknown key for function");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ // TODO see if there is a more elegant solution to selecting the rewrite
+ // descriptor type
+ if (!Target.empty())
+ DL->push_back(llvm::make_unique<ExplicitRewriteFunctionDescriptor>(
+ Source, Target, Naked));
+ else
+ DL->push_back(
+ llvm::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+      YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = Value->getValue(ValueStorage);
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = Value->getValue(ValueStorage);
+ } else if (KeyValue.equals("transform")) {
+ Transform = Value->getValue(ValueStorage);
+ } else {
+ YS.printError(Field.getKey(), "unknown Key for Global Variable");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(llvm::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(llvm::make_unique<PatternRewriteGlobalVariableDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = Value->getValue(ValueStorage);
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = Value->getValue(ValueStorage);
+ } else if (KeyValue.equals("transform")) {
+ Transform = Value->getValue(ValueStorage);
+ } else {
+ YS.printError(Field.getKey(), "unknown key for Global Alias");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(llvm::make_unique<ExplicitRewriteNamedAliasDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(llvm::make_unique<PatternRewriteNamedAliasDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+namespace {
+class RewriteSymbolsLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteSymbolsLegacyPass();
+ RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
+
+ bool runOnModule(Module &M) override;
+
+private:
+ RewriteSymbolPass Impl;
+};
+
+char RewriteSymbolsLegacyPass::ID = 0;
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
+}
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
+ SymbolRewriter::RewriteDescriptorList &DL)
+ : ModulePass(ID), Impl(DL) {}
+
+bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
+ return Impl.runImpl(M);
+}
+}
+
+namespace llvm {
+PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runImpl(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool RewriteSymbolPass::runImpl(Module &M) {
+ bool Changed;
+
+ Changed = false;
+ for (auto &Descriptor : Descriptors)
+ Changed |= Descriptor->performOnModule(M);
+
+ return Changed;
+}
+
+void RewriteSymbolPass::loadAndParseMapFiles() {
+ const std::vector<std::string> MapFiles(RewriteMapFiles);
+ SymbolRewriter::RewriteMapParser Parser;
+
+ for (const auto &MapFile : MapFiles)
+ Parser.parse(MapFile, &Descriptors);
+}
+}
+
+INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
+ false, false)
+
+ModulePass *llvm::createRewriteSymbolsPass() {
+ return new RewriteSymbolsLegacyPass();
+}
+
+ModulePass *
+llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
+ return new RewriteSymbolsLegacyPass(DL);
+}
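
A minimal embedding sketch using only the entry points defined above; the map
file name is illustrative, and a parse failure terminates via
report_fatal_error as shown earlier:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"

// Parse a rewrite map and apply it to M with the legacy pass manager.
static void applyRewriteMap(llvm::Module &M) {
  llvm::SymbolRewriter::RewriteDescriptorList Descriptors;
  llvm::SymbolRewriter::RewriteMapParser Parser;
  Parser.parse("rewrite.map", &Descriptors);

  llvm::legacy::PassManager PM;
  PM.add(llvm::createRewriteSymbolsPass(Descriptors));
  PM.run(M);
}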
diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
new file mode 100644
index 000000000000..9385f825523c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -0,0 +1,116 @@
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to ensure that functions have at most one return
+// instruction in them. Additionally, it keeps track of which block is the new
+// exit block of the CFG. If the CFG has no returning blocks, the recorded
+// return block is a null pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+char UnifyFunctionExitNodes::ID = 0;
+INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn",
+ "Unify function exit nodes", false, false)
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
+ return new UnifyFunctionExitNodes();
+}
+
+void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+}
+
+// Unify all exit nodes of the CFG by creating a new BasicBlock and converting
+// all return instructions into unconditional branches to this new block. The
+// single exit node is recorded in ReturnBlock.
+//
+// If there are no return instructions in the Function, ReturnBlock is set to
+// a null pointer.
+//
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
+  // Loop over all of the blocks in the function, tracking the blocks that
+  // return and the blocks that end in unreachable.
+  //
+ std::vector<BasicBlock*> ReturningBlocks;
+ std::vector<BasicBlock*> UnreachableBlocks;
+ for (BasicBlock &I : F)
+ if (isa<ReturnInst>(I.getTerminator()))
+ ReturningBlocks.push_back(&I);
+ else if (isa<UnreachableInst>(I.getTerminator()))
+ UnreachableBlocks.push_back(&I);
+
+  // Handle the unreachable blocks first.
+ if (UnreachableBlocks.empty()) {
+ UnreachableBlock = nullptr;
+ } else if (UnreachableBlocks.size() == 1) {
+ UnreachableBlock = UnreachableBlocks.front();
+ } else {
+ UnreachableBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedUnreachableBlock", &F);
+ new UnreachableInst(F.getContext(), UnreachableBlock);
+
+ for (BasicBlock *BB : UnreachableBlocks) {
+ BB->getInstList().pop_back(); // Remove the unreachable inst.
+ BranchInst::Create(UnreachableBlock, BB);
+ }
+ }
+
+ // Now handle return blocks.
+ if (ReturningBlocks.empty()) {
+ ReturnBlock = nullptr;
+ return false; // No blocks return
+ } else if (ReturningBlocks.size() == 1) {
+ ReturnBlock = ReturningBlocks.front(); // Already has a single return block
+ return false;
+ }
+
+  // Otherwise, we need to insert a new basic block into the function, add a
+  // PHI node (if the function returns a value), and convert all of the return
+  // instructions into unconditional branches.
+ //
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedReturnBlock", &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ //
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+ ReturnBlock = NewRetBlock;
+ return true;
+}
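
A sketch of driving mergereturn over a whole module with the legacy function
pass manager; the header location of createUnifyFunctionExitNodesPass() is an
assumption based on the includes above:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"

// Give every defined function in M at most one return block.
static void unifyExitNodes(llvm::Module &M) {
  llvm::legacy::FunctionPassManager FPM(&M);
  FPM.add(llvm::createUnifyFunctionExitNodesPass());
  FPM.doInitialization();
  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F);
  FPM.doFinalization();
}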
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
new file mode 100644
index 000000000000..f6c7d1c4989e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -0,0 +1,45 @@
+//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// TransformUtils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeTransformUtils - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeTransformUtils(PassRegistry &Registry) {
+ initializeAddDiscriminatorsLegacyPassPass(Registry);
+ initializeBreakCriticalEdgesPass(Registry);
+ initializeInstNamerPass(Registry);
+ initializeLCSSAWrapperPassPass(Registry);
+ initializeLibCallsShrinkWrapLegacyPassPass(Registry);
+ initializeLoopSimplifyPass(Registry);
+ initializeLowerInvokeLegacyPassPass(Registry);
+ initializeLowerSwitchPass(Registry);
+ initializeNameAnonGlobalLegacyPassPass(Registry);
+ initializePromoteLegacyPassPass(Registry);
+ initializeStripNonLineTableDebugInfoPass(Registry);
+ initializeUnifyFunctionExitNodesPass(Registry);
+ initializeInstSimplifierPass(Registry);
+ initializeMetaRenamerPass(Registry);
+ initializeStripGCRelocatesPass(Registry);
+ initializePredicateInfoPrinterLegacyPassPass(Registry);
+}
+
+/// LLVMInitializeTransformUtils - C binding for initializeTransformUtils.
+void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
+ initializeTransformUtils(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
new file mode 100644
index 000000000000..c3feea6a0a41
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -0,0 +1,495 @@
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+namespace llvm {
+namespace VNCoercion {
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ const DataLayout &DL) {
+  // If the loaded or stored value is a first-class array or struct, don't try
+  // to transform it. We need to be able to bitcast to an integer type.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+ StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+ return false;
+
+ // The store has to be at least as big as the load.
+ if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ return false;
+
+ // Don't coerce non-integral pointers to integers or vice versa.
+ if (DL.isNonIntegralPointerType(StoredVal->getType()) !=
+ DL.isNonIntegralPointerType(LoadTy))
+ return false;
+
+ return true;
+}
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ // If this is already the right type, just return it.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoredValSize == LoadedValSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->isPtrOrPtrVectorTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+ }
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoredValSize >= LoadedValSize &&
+ "canCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+ StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
+ if (DL.isBigEndian()) {
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
+ DL.getTypeStoreSizeInBits(LoadedTy);
+ StoredVal = Helper.CreateLShr(
+ StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+ StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ else
+ // Otherwise, bitcast.
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+}
+
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value. LoadedTy is the type of the load we want to replace.
+/// IRB is IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilder<> &IRB, const DataLayout &DL) {
+ return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove). This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up. This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+ Value *WritePtr,
+ uint64_t WriteSizeInBits,
+ const DataLayout &DL) {
+ // If the loaded or stored value is a first class array or struct, don't try
+ // to transform them. We need to be able to bitcast to integer.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+ return -1;
+
+ int64_t StoreOffset = 0, LoadOffset = 0;
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+ if (StoreBase != LoadBase)
+ return -1;
+
+ // If the load and store are to the exact same address, they should have been
+ // a must alias. AA must have gotten confused.
+ // FIXME: Study to see if/when this happens. One case is forwarding a memset
+ // to a load from the base of the memset.
+
+ // If the load and store don't overlap at all, the store doesn't provide
+ // anything to the load. In this case, they really don't alias at all, AA
+ // must have gotten confused.
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+ if ((WriteSizeInBits & 7) | (LoadSize & 7))
+ return -1;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
+
+ bool isAAFailure = false;
+ if (StoreOffset < LoadOffset)
+ isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+ else
+ isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+ if (isAAFailure)
+ return -1;
+
+  // If the load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it. We could do something crazy in the future
+  // (issue a smaller load and then merge the bits in) but this seems unlikely
+  // to be valuable.
+ if (StoreOffset > LoadOffset ||
+ StoreOffset + StoreSize < LoadOffset + LoadSize)
+ return -1;
+
+ // Okay, we can do this transformation. Return the number of bytes into the
+ // store that the load is.
+ return LoadOffset - StoreOffset;
+}
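+
+// Worked example for the offset computation above (illustrative numbers): a
+// 16-byte store at base+0 fully covers a 4-byte load at base+8, so the
+// function returns 8; a 4-byte load at base+14 would spill past the end of
+// the store, so the result would be -1.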
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+ StoreInst *DepSI, const DataLayout &DL) {
+ // Cannot handle reading from store of first-class aggregate yet.
+ if (DepSI->getValueOperand()->getType()->isStructTy() ||
+ DepSI->getValueOperand()->getType()->isArrayTy())
+ return -1;
+
+ Value *StorePtr = DepSI->getPointerOperand();
+ uint64_t StoreSize =
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+ DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load. See if
+/// the other load can feed into the second load.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+ const DataLayout &DL) {
+ // Cannot handle reading from store of first-class aggregate yet.
+ if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+ return -1;
+
+ Value *DepPtr = DepLI->getPointerOperand();
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+ int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+ if (R != -1)
+ return R;
+
+  // If we have a load/load clobber and DepLI can be widened to cover this
+  // load, then we should widen it!
+ int64_t LoadOffs = 0;
+ const Value *LoadBase =
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+ unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
+ LoadBase, LoadOffs, LoadSize, DepLI);
+ if (Size == 0)
+ return -1;
+
+ // Check non-obvious conditions enforced by MDA which we rely on for being
+ // able to materialize this potentially available value
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+ MemIntrinsic *MI, const DataLayout &DL) {
+ // If the mem operation is a non-constant size, we can't handle it.
+ ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+ if (!SizeCst)
+ return -1;
+ uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+  // If this is a memset, we just need to see if the offset is valid within
+  // the size of the memset.
+ if (MI->getIntrinsicID() == Intrinsic::memset)
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+
+ // If we have a memcpy/memmove, the only case we can handle is if this is a
+ // copy from constant memory. In that case, we can read directly from the
+ // constant memory.
+ MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+ Constant *Src = dyn_cast<Constant>(MTI->getSource());
+ if (!Src)
+ return -1;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+ if (!GV || !GV->isConstant())
+ return -1;
+
+ // See if the access is within the bounds of the transfer.
+ int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ if (Offset == -1)
+ return Offset;
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ return Offset;
+ return -1;
+}
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
+ uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+ // Compute which bits of the stored value are being used by the load. Convert
+ // to an integer type to start with.
+ if (SrcVal->getType()->isPtrOrPtrVectorTy())
+ SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+ if (!SrcVal->getType()->isIntegerTy())
+ SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+ // Shift the bits to the least significant depending on endianness.
+ unsigned ShiftAmt;
+ if (DL.isLittleEndian())
+ ShiftAmt = Offset * 8;
+ else
+ ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+ if (ShiftAmt)
+ SrcVal = Helper.CreateLShr(SrcVal,
+ ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+ if (LoadSize != StoreSize)
+ SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+ IntegerType::get(Ctx, LoadSize * 8));
+ return SrcVal;
+}
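+
+// Worked example for getStoreValueForLoadHelper (illustrative sizes): an i16
+// loaded at byte offset 2 out of an i64 store gives StoreSize == 8 and
+// LoadSize == 2. On a little-endian target the stored value is shifted right
+// by 2 * 8 == 16 bits; on a big-endian target the shift is
+// (8 - 2 - 2) * 8 == 32 bits. The value is then truncated to i16.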
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store. This means that the store provides bits used by
+/// the load but the pointers don't must-alias. Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+
+ IRBuilder<> Builder(InsertPt);
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ ConstantFolder F;
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load. This means that the load *may* provide bits used
+/// by the load but we can't be sure because the pointers don't must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+ // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+ // widen SrcVal out to a larger load.
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+    // If we have a load/load clobber and DepLI can be widened to cover this
+    // load, then we should widen it to the next power-of-2 size that is big
+    // enough!
+ unsigned NewLoadSize = Offset + LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+ DestPTy =
+ PointerType::get(DestPTy, PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlignment());
+
+ DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ SrcVal = NewLoad;
+ }
+
+ return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize)
+ return nullptr;
+ return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = LoadTy->getContext();
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+
+ // We know that this method is only called when the mem transfer fully
+ // provides the bits for the load.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+ // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+ // independently of what the offset is.
+ T *Val = cast<T>(MSI->getValue());
+ if (LoadSize != 1)
+ Val =
+ Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+ T *OneElt = Val;
+
+ // Splat the value out to the right number of bits.
+ for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+ // If we can double the number of bytes set, do it.
+ if (NumBytesSet * 2 <= LoadSize) {
+ T *ShVal = Helper.CreateShl(
+ Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+ Val = Helper.CreateOr(Val, ShVal);
+ NumBytesSet <<= 1;
+ continue;
+ }
+
+ // Otherwise insert one byte at a time.
+ T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+ Val = Helper.CreateOr(OneElt, ShVal);
+ ++NumBytesSet;
+ }
+
+ return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+ }
+
+ // Otherwise, this is a memcpy/memmove from a constant global.
+ MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+ Constant *Src = cast<Constant>(MTI->getSource());
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(InsertPt);
+ return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+ LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+  // The only case that analyzeLoadFromClobberingMemInst accepts but that
+  // cannot be converted to a constant is a memset of a non-constant value.
+ if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+ if (!isa<Constant>(MSI->getValue()))
+ return nullptr;
+ ConstantFolder F;
+ return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+ LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
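
A sketch of how a GVN-style client might combine the analysis and
materialization helpers above to forward a clobbering store into a load; the
wrapper function and its control flow are illustrative:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

// Try to satisfy Load from the value stored by DepStore; returns the forwarded
// value, or nullptr if the store does not cover the loaded bits.
static llvm::Value *tryForwardStore(llvm::LoadInst *Load,
                                    llvm::StoreInst *DepStore,
                                    const llvm::DataLayout &DL) {
  using namespace llvm::VNCoercion;
  int Offset = analyzeLoadFromClobberingStore(
      Load->getType(), Load->getPointerOperand(), DepStore, DL);
  if (Offset == -1)
    return nullptr;
  // Extract the loaded bits from the stored value, inserting any needed casts
  // and shifts immediately before the load.
  return getStoreValueForLoad(DepStore->getValueOperand(), unsigned(Offset),
                              Load->getType(), Load, DL);
}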
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
new file mode 100644
index 000000000000..930972924c3c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -0,0 +1,1109 @@
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+using namespace llvm;
+
+// Out of line method to get vtable etc for class.
+void ValueMapTypeRemapper::anchor() {}
+void ValueMaterializer::anchor() {}
+
+namespace {
+
+/// A basic block used in a BlockAddress whose function body is not yet
+/// materialized.
+struct DelayedBasicBlock {
+ BasicBlock *OldBB;
+ std::unique_ptr<BasicBlock> TempBB;
+
+ DelayedBasicBlock(const BlockAddress &Old)
+ : OldBB(Old.getBasicBlock()),
+ TempBB(BasicBlock::Create(Old.getContext())) {}
+};
+
+struct WorklistEntry {
+ enum EntryKind {
+ MapGlobalInit,
+ MapAppendingVar,
+ MapGlobalAliasee,
+ RemapFunction
+ };
+ struct GVInitTy {
+ GlobalVariable *GV;
+ Constant *Init;
+ };
+ struct AppendingGVTy {
+ GlobalVariable *GV;
+ Constant *InitPrefix;
+ };
+ struct GlobalAliaseeTy {
+ GlobalAlias *GA;
+ Constant *Aliasee;
+ };
+
+ unsigned Kind : 2;
+ unsigned MCID : 29;
+ unsigned AppendingGVIsOldCtorDtor : 1;
+ unsigned AppendingGVNumNewMembers;
+ union {
+ GVInitTy GVInit;
+ AppendingGVTy AppendingGV;
+ GlobalAliaseeTy GlobalAliasee;
+ Function *RemapF;
+ } Data;
+};
+
+struct MappingContext {
+ ValueToValueMapTy *VM;
+ ValueMaterializer *Materializer = nullptr;
+
+ /// Construct a MappingContext with a value map and materializer.
+ explicit MappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr)
+ : VM(&VM), Materializer(Materializer) {}
+};
+
+class MDNodeMapper;
+class Mapper {
+ friend class MDNodeMapper;
+
+#ifndef NDEBUG
+ DenseSet<GlobalValue *> AlreadyScheduled;
+#endif
+
+ RemapFlags Flags;
+ ValueMapTypeRemapper *TypeMapper;
+ unsigned CurrentMCID = 0;
+ SmallVector<MappingContext, 2> MCs;
+ SmallVector<WorklistEntry, 4> Worklist;
+ SmallVector<DelayedBasicBlock, 1> DelayedBBs;
+ SmallVector<Constant *, 16> AppendingInits;
+
+public:
+ Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
+ : Flags(Flags), TypeMapper(TypeMapper),
+ MCs(1, MappingContext(VM, Materializer)) {}
+
+ /// ValueMapper should explicitly call \a flush() before destruction.
+ ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
+
+ bool hasWorkToDo() const { return !Worklist.empty(); }
+
+ unsigned
+ registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr) {
+ MCs.push_back(MappingContext(VM, Materializer));
+ return MCs.size() - 1;
+ }
+
+ void addFlags(RemapFlags Flags);
+
+ void remapGlobalObjectMetadata(GlobalObject &GO);
+
+ Value *mapValue(const Value *V);
+ void remapInstruction(Instruction *I);
+ void remapFunction(Function &F);
+
+ Constant *mapConstant(const Constant *C) {
+ return cast_or_null<Constant>(mapValue(C));
+ }
+
+ /// Map metadata.
+ ///
+ /// Find the mapping for MD. Guarantees that the return will be resolved
+ /// (not an MDNode, or MDNode::isResolved() returns true).
+ Metadata *mapMetadata(const Metadata *MD);
+
+ void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID);
+ void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID);
+ void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID);
+ void scheduleRemapFunction(Function &F, unsigned MCID);
+
+ void flush();
+
+private:
+ void mapGlobalInitializer(GlobalVariable &GV, Constant &Init);
+ void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers);
+ void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee);
+ void remapFunction(Function &F, ValueToValueMapTy &VM);
+
+ ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
+ ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
+
+ Value *mapBlockAddress(const BlockAddress &BA);
+
+ /// Map metadata that doesn't require visiting operands.
+ Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
+
+ Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
+ Metadata *mapToSelf(const Metadata *MD);
+};
+
+class MDNodeMapper {
+ Mapper &M;
+
+ /// Data about a node in \a UniquedGraph.
+ struct Data {
+ bool HasChanged = false;
+ unsigned ID = ~0u;
+ TempMDNode Placeholder;
+ };
+
+ /// A graph of uniqued nodes.
+ struct UniquedGraph {
+ SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
+ SmallVector<MDNode *, 16> POT; // Post-order traversal.
+
+ /// Propagate changed operands through the post-order traversal.
+ ///
+ /// Iteratively update \a Data::HasChanged for each node based on \a
+ /// Data::HasChanged of its operands, until fixed point.
+ void propagateChanges();
+
+ /// Get a forward reference to a node to use as an operand.
+ Metadata &getFwdReference(MDNode &Op);
+ };
+
+ /// Worklist of distinct nodes whose operands need to be remapped.
+ SmallVector<MDNode *, 16> DistinctWorklist;
+
+ // Storage for a UniquedGraph.
+ SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
+ SmallVector<MDNode *, 16> POTStorage;
+
+public:
+ MDNodeMapper(Mapper &M) : M(M) {}
+
+ /// Map a metadata node (and its transitive operands).
+ ///
+ /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
+ /// algorithm handles distinct nodes and uniqued node subgraphs using
+ /// different strategies.
+ ///
+ /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
+ /// using \a mapDistinctNode(). Their mapping can always be computed
+ /// immediately without visiting operands, even if their operands change.
+ ///
+ /// The mapping for uniqued nodes depends on whether their operands change.
+ /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
+ /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
+ /// added to \a DistinctWorklist with \a mapDistinctNode().
+ ///
+ /// After mapping \c N itself, this function remaps the operands of the
+ /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
+ /// N has been mapped.
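+ ///
+ /// For example (illustrative IR):
+ /// \code
+ ///   !0 = distinct !{!1}
+ ///   !1 = !{i32 7}
+ /// \endcode
+ /// Mapping \c !0 computes its mapping immediately (a clone, or \c !0 itself
+ /// under \a RF_MoveDistinctMDs) and pushes it onto \a DistinctWorklist; the
+ /// uniqued operand \c !1 is then handled by \a mapTopLevelUniquedNode() while
+ /// draining the worklist.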
+ Metadata *map(const MDNode &N);
+
+private:
+ /// Map a top-level uniqued node and the uniqued subgraph underneath it.
+ ///
+ /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
+ /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
+ /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
+ /// operands use the identity mapping.
+ ///
+ /// The algorithm works as follows:
+ ///
+ /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
+ /// save the post-order traversal in the given \a UniquedGraph, tracking
+ /// whether the nodes' operands change.
+ ///
+ /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
+ /// through the \a UniquedGraph until fixed point, following the rule
+ /// that if a node changes, any node that references it must also change.
+ ///
+ /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
+ /// (referencing new operands) where necessary.
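+ ///
+ /// For instance, if the only change in the subgraph is that one node's
+ /// \a ConstantAsMetadata operand was remapped to a new constant, then step 2
+ /// marks every uniqued node in the subgraph that transitively references it
+ /// as changed, and step 3 recreates exactly those nodes; all other nodes in
+ /// the POT are mapped to themselves.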
+ Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
+
+ /// Try to map the operand of an \a MDNode.
+ ///
+ /// If \c Op is already mapped, return the mapping. If it's not an \a
+ /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
+ /// return the result of \a mapDistinctNode().
+ ///
+ /// \return None if \c Op is an unmapped uniqued \a MDNode.
+ /// \post getMappedOp(Op) only returns None if this returns None.
+ Optional<Metadata *> tryToMapOperand(const Metadata *Op);
+
+ /// Map a distinct node.
+ ///
+ /// Return the mapping for the distinct node \c N, saving the result in \a
+ /// DistinctWorklist for later remapping.
+ ///
+ /// \pre \c N is not yet mapped.
+ /// \pre \c N.isDistinct().
+ MDNode *mapDistinctNode(const MDNode &N);
+
+ /// Get a previously mapped node.
+ Optional<Metadata *> getMappedOp(const Metadata *Op) const;
+
+ /// Create a post-order traversal of an unmapped uniqued node subgraph.
+ ///
+ /// This traverses the metadata graph deeply enough to map \c FirstN. It
+ /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
+ /// metadata that has already been mapped will not be part of the POT.
+ ///
+ /// Each node that has a changed operand from outside the graph (e.g., a
+ /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
+ /// is marked with \a Data::HasChanged.
+ ///
+ /// \return \c true if any nodes in \c G have \a Data::HasChanged.
+ /// \post \c G.POT is a post-order traversal ending with \c FirstN.
+ /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
+ /// to change because of operands outside the graph.
+ bool createPOT(UniquedGraph &G, const MDNode &FirstN);
+
+ /// Visit the operands of a uniqued node in the POT.
+ ///
+ /// Visit the operands in the range from \c I to \c E, returning the first
+ /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
+ /// where to continue the loop through the operands.
+ ///
+ /// This sets \c HasChanged if any of the visited operands change.
+ MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged);
+
+ /// Map all the nodes in the given uniqued graph.
+ ///
+ /// This visits all the nodes in \c G in post-order, using the identity
+ /// mapping or creating a new node depending on \a Data::HasChanged.
+ ///
+ /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
+ /// their operands outside of \c G.
+ /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
+ /// operands have changed.
+ /// \post \a getMappedOp() returns the mapped node for every node in \c G.
+ void mapNodesInPOT(UniquedGraph &G);
+
+ /// Remap a node's operands using the given functor.
+ ///
+ /// Iterate through the operands of \c N and update them in place using \c
+ /// mapOperand.
+ ///
+ /// \pre N.isDistinct() or N.isTemporary().
+ template <class OperandMapper>
+ void remapOperands(MDNode &N, OperandMapper mapOperand);
+};
+
+} // end namespace
+
+Value *Mapper::mapValue(const Value *V) {
+ ValueToValueMapTy::iterator I = getVM().find(V);
+
+ // If the value already exists in the map, use it.
+ if (I != getVM().end()) {
+ assert(I->second && "Unexpected null mapping");
+ return I->second;
+ }
+
+ // If we have a materializer and it can materialize a value, use that.
+ if (auto *Materializer = getMaterializer()) {
+ if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
+ getVM()[V] = NewV;
+ return NewV;
+ }
+ }
+
+ // Global values do not need to be seeded into the VM if they
+ // are using the identity mapping.
+ if (isa<GlobalValue>(V)) {
+ if (Flags & RF_NullMapMissingGlobalValues)
+ return nullptr;
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ // Inline asm may need *type* remapping.
+ FunctionType *NewTy = IA->getFunctionType();
+ if (TypeMapper) {
+ NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
+
+ if (NewTy != IA->getFunctionType())
+ V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
+ IA->hasSideEffects(), IA->isAlignStack());
+ }
+
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
+ const Metadata *MD = MDV->getMetadata();
+
+ if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
+ // Look through to grab the local value.
+ if (Value *LV = mapValue(LAM->getValue())) {
+ if (V == LAM->getValue())
+ return const_cast<Value *>(V);
+ return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
+ }
+
+ // FIXME: always return nullptr once Verifier::verifyDominatesUse()
+ // ensures metadata operands only reference defined SSA values.
+ return (Flags & RF_IgnoreMissingLocals)
+ ? nullptr
+ : MetadataAsValue::get(V->getContext(),
+ MDTuple::get(V->getContext(), None));
+ }
+
+ // If this is module-level metadata and we know that nothing at the module
+ // level is changing, then use an identity mapping.
+ if (Flags & RF_NoModuleLevelChanges)
+ return getVM()[V] = const_cast<Value *>(V);
+
+ // Map the metadata and turn it into a value.
+ auto *MappedMD = mapMetadata(MD);
+ if (MD == MappedMD)
+ return getVM()[V] = const_cast<Value *>(V);
+ return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
+ }
+
+ // Okay, this either must be a constant (which may or may not be mappable) or
+ // is something that is not in the mapping table.
+ Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
+ if (!C)
+ return nullptr;
+
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return mapBlockAddress(*BA);
+
+ auto mapValueOrNull = [this](Value *V) {
+ auto Mapped = mapValue(V);
+ assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
+ "Unexpected null mapping for constant operand without "
+ "NullMapMissingGlobalValues flag");
+ return Mapped;
+ };
+
+ // Otherwise, we have some other constant to remap. Start by checking to see
+ // if all operands have an identity remapping.
+ unsigned OpNo = 0, NumOperands = C->getNumOperands();
+ Value *Mapped = nullptr;
+ for (; OpNo != NumOperands; ++OpNo) {
+ Value *Op = C->getOperand(OpNo);
+ Mapped = mapValueOrNull(Op);
+ if (!Mapped)
+ return nullptr;
+ if (Mapped != Op)
+ break;
+ }
+
+ // See if the type mapper wants to remap the type as well.
+ Type *NewTy = C->getType();
+ if (TypeMapper)
+ NewTy = TypeMapper->remapType(NewTy);
+
+ // If the result type and all operands match up, then just insert an identity
+ // mapping.
+ if (OpNo == NumOperands && NewTy == C->getType())
+ return getVM()[V] = C;
+
+ // Okay, we need to create a new constant. We've already processed some or
+ // all of the operands, set them all up now.
+ SmallVector<Constant*, 8> Ops;
+ Ops.reserve(NumOperands);
+ for (unsigned j = 0; j != OpNo; ++j)
+ Ops.push_back(cast<Constant>(C->getOperand(j)));
+
+ // If one of the operands mismatched, push its mapping and then map and push
+ // the remaining operands.
+ if (OpNo != NumOperands) {
+ Ops.push_back(cast<Constant>(Mapped));
+
+ // Map the rest of the operands that aren't processed yet.
+ for (++OpNo; OpNo != NumOperands; ++OpNo) {
+ Mapped = mapValueOrNull(C->getOperand(OpNo));
+ if (!Mapped)
+ return nullptr;
+ Ops.push_back(cast<Constant>(Mapped));
+ }
+ }
+ Type *NewSrcTy = nullptr;
+ if (TypeMapper)
+ if (auto *GEPO = dyn_cast<GEPOperator>(C))
+ NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
+ if (isa<ConstantArray>(C))
+ return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
+ if (isa<ConstantStruct>(C))
+ return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
+ if (isa<ConstantVector>(C))
+ return getVM()[V] = ConstantVector::get(Ops);
+ // If this is a no-operand constant, it must be because the type was remapped.
+ if (isa<UndefValue>(C))
+ return getVM()[V] = UndefValue::get(NewTy);
+ if (isa<ConstantAggregateZero>(C))
+ return getVM()[V] = ConstantAggregateZero::get(NewTy);
+ assert(isa<ConstantPointerNull>(C));
+ return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
+}
+
+Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
+ Function *F = cast<Function>(mapValue(BA.getFunction()));
+
+ // F may not have materialized its initializer. In that case, create a
+ // dummy basic block for now, and replace it once we've materialized all
+ // the initializers.
+ BasicBlock *BB;
+ if (F->empty()) {
+ DelayedBBs.push_back(DelayedBasicBlock(BA));
+ BB = DelayedBBs.back().TempBB.get();
+ } else {
+ BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
+ }
+
+ return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
+}
+
+Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
+ getVM().MD()[Key].reset(Val);
+ return Val;
+}
+
+Metadata *Mapper::mapToSelf(const Metadata *MD) {
+ return mapToMetadata(MD, const_cast<Metadata *>(MD));
+}
+
+Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
+#ifndef NDEBUG
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
+ M.getVM().getMappedMD(Op)) &&
+ "Expected Value to be memoized");
+ else
+ assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
+ "Expected result to be memoized");
+#endif
+ return *MappedOp;
+ }
+
+ const MDNode &N = *cast<MDNode>(Op);
+ if (N.isDistinct())
+ return mapDistinctNode(N);
+ return None;
+}
+
+MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
+ assert(N.isDistinct() && "Expected a distinct node");
+ assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
+ DistinctWorklist.push_back(cast<MDNode>(
+ (M.Flags & RF_MoveDistinctMDs)
+ ? M.mapToSelf(&N)
+ : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone()))));
+ return DistinctWorklist.back();
+}
+
+static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
+ Value *MappedV) {
+ if (CMD.getValue() == MappedV)
+ return const_cast<ConstantAsMetadata *>(&CMD);
+ return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
+}
+
+Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
+ return *MappedOp;
+
+ if (isa<MDString>(Op))
+ return const_cast<Metadata *>(Op);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
+
+ return None;
+}
+
+Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
+ auto Where = Info.find(&Op);
+ assert(Where != Info.end() && "Expected a valid reference");
+
+ auto &OpD = Where->second;
+ if (!OpD.HasChanged)
+ return Op;
+
+ // Lazily construct a temporary node.
+ if (!OpD.Placeholder)
+ OpD.Placeholder = Op.clone();
+
+ return *OpD.Placeholder;
+}
+
+template <class OperandMapper>
+void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
+ assert(!N.isUniqued() && "Expected distinct or temporary nodes");
+ for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
+ Metadata *Old = N.getOperand(I);
+ Metadata *New = mapOperand(Old);
+
+ if (Old != New)
+ N.replaceOperandWith(I, New);
+ }
+}
+
+namespace {
+/// An entry in the worklist for the post-order traversal.
+struct POTWorklistEntry {
+ MDNode *N; ///< Current node.
+ MDNode::op_iterator Op; ///< Current operand of \c N.
+
+ /// Keep a flag of whether operands have changed in the worklist to avoid
+ /// hitting the map in \a UniquedGraph.
+ bool HasChanged = false;
+
+ POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
+};
+} // end namespace
+
+bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
+ assert(G.Info.empty() && "Expected a fresh traversal");
+ assert(FirstN.isUniqued() && "Expected uniqued node in POT");
+
+ // Construct a post-order traversal of the uniqued subgraph under FirstN.
+ bool AnyChanges = false;
+ SmallVector<POTWorklistEntry, 16> Worklist;
+ Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
+ (void)G.Info[&FirstN];
+ while (!Worklist.empty()) {
+ // Start or continue the traversal through this node's operands.
+ auto &WE = Worklist.back();
+ if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
+ // Push a new node to traverse first.
+ Worklist.push_back(POTWorklistEntry(*N));
+ continue;
+ }
+
+ // Push the node onto the POT.
+ assert(WE.N->isUniqued() && "Expected only uniqued nodes");
+ assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
+ auto &D = G.Info[WE.N];
+ AnyChanges |= D.HasChanged = WE.HasChanged;
+ D.ID = G.POT.size();
+ G.POT.push_back(WE.N);
+
+ // Pop the node off the worklist.
+ Worklist.pop_back();
+ }
+ return AnyChanges;
+}
+
+MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged) {
+ while (I != E) {
+ Metadata *Op = *I++; // Increment even on early return.
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
+ // Check if the operand changes.
+ HasChanged |= Op != *MappedOp;
+ continue;
+ }
+
+ // A uniqued metadata node.
+ MDNode &OpN = *cast<MDNode>(Op);
+ assert(OpN.isUniqued() &&
+ "Only uniqued operands cannot be mapped immediately");
+ if (G.Info.insert(std::make_pair(&OpN, Data())).second)
+ return &OpN; // This is a new one. Return it.
+ }
+ return nullptr;
+}
+
+void MDNodeMapper::UniquedGraph::propagateChanges() {
+ bool AnyChanges;
+ do {
+ AnyChanges = false;
+ for (MDNode *N : POT) {
+ auto &D = Info[N];
+ if (D.HasChanged)
+ continue;
+
+ if (none_of(N->operands(), [&](const Metadata *Op) {
+ auto Where = Info.find(Op);
+ return Where != Info.end() && Where->second.HasChanged;
+ }))
+ continue;
+
+ AnyChanges = D.HasChanged = true;
+ }
+ } while (AnyChanges);
+}
+
+void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
+ // Construct uniqued nodes, building forward references as necessary.
+ SmallVector<MDNode *, 16> CyclicNodes;
+ for (auto *N : G.POT) {
+ auto &D = G.Info[N];
+ if (!D.HasChanged) {
+ // The node hasn't changed.
+ M.mapToSelf(N);
+ continue;
+ }
+
+ // Remember whether this node had a placeholder.
+ bool HadPlaceholder(D.Placeholder);
+
+ // Clone the uniqued node and remap the operands.
+ TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
+ remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = getMappedOp(Old))
+ return *MappedOp;
+ (void)D;
+ assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
+ return &G.getFwdReference(*cast<MDNode>(Old));
+ });
+
+ auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
+ M.mapToMetadata(N, NewN);
+
+ // Nodes that were referenced out of order in the POT are involved in a
+ // uniquing cycle.
+ if (HadPlaceholder)
+ CyclicNodes.push_back(NewN);
+ }
+
+ // Resolve cycles.
+ for (auto *N : CyclicNodes)
+ if (!N->isResolved())
+ N->resolveCycles();
+}
+
+Metadata *MDNodeMapper::map(const MDNode &N) {
+ assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
+ assert(!(M.Flags & RF_NoModuleLevelChanges) &&
+ "MDNodeMapper::map assumes module-level changes");
+
+ // Require resolved nodes whenever metadata might be remapped.
+ assert(N.isResolved() && "Unexpected unresolved node");
+
+ Metadata *MappedN =
+ N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
+ while (!DistinctWorklist.empty())
+ remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
+ return *MappedOp;
+ return mapTopLevelUniquedNode(*cast<MDNode>(Old));
+ });
+ return MappedN;
+}
+
+Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
+ assert(FirstN.isUniqued() && "Expected uniqued node");
+
+ // Create a post-order traversal of uniqued nodes under FirstN.
+ UniquedGraph G;
+ if (!createPOT(G, FirstN)) {
+ // Return early if no nodes have changed.
+ for (const MDNode *N : G.POT)
+ M.mapToSelf(N);
+ return &const_cast<MDNode &>(FirstN);
+ }
+
+ // Update graph with all nodes that have changed.
+ G.propagateChanges();
+
+ // Map all the nodes in the graph.
+ mapNodesInPOT(G);
+
+ // Return the original node, remapped.
+ return *getMappedOp(&FirstN);
+}
+
+namespace {
+
+struct MapMetadataDisabler {
+ ValueToValueMapTy &VM;
+
+ MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) {
+ VM.disableMapMetadata();
+ }
+ ~MapMetadataDisabler() { VM.enableMapMetadata(); }
+};
+
+} // end namespace
+
+Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
+ // If the value already exists in the map, use it.
+ if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
+ return *NewMD;
+
+ if (isa<MDString>(MD))
+ return const_cast<Metadata *>(MD);
+
+ // This is a module-level metadata. If nothing at the module level is
+ // changing, use an identity mapping.
+ if ((Flags & RF_NoModuleLevelChanges))
+ return const_cast<Metadata *>(MD);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
+ // Disallow recursion into metadata mapping through mapValue.
+ MapMetadataDisabler MMD(getVM());
+
+ // Don't memoize ConstantAsMetadata. Instead of lasting until the
+ // LLVMContext is destroyed, they can be deleted when the GlobalValue they
+ // reference is destructed. These aren't super common, so the extra
+ // indirection isn't that expensive.
+ return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
+ }
+
+ assert(isa<MDNode>(MD) && "Expected a metadata node");
+
+ return None;
+}
+
+Metadata *Mapper::mapMetadata(const Metadata *MD) {
+ assert(MD && "Expected valid metadata");
+ assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
+
+ if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
+ return *NewMD;
+
+ return MDNodeMapper(*this).map(*cast<MDNode>(MD));
+}
+
+void Mapper::flush() {
+ // Flush out the worklist of global values.
+ while (!Worklist.empty()) {
+ WorklistEntry E = Worklist.pop_back_val();
+ CurrentMCID = E.MCID;
+ switch (E.Kind) {
+ case WorklistEntry::MapGlobalInit:
+ E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
+ remapGlobalObjectMetadata(*E.Data.GVInit.GV);
+ break;
+ case WorklistEntry::MapAppendingVar: {
+ unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
+ mapAppendingVariable(*E.Data.AppendingGV.GV,
+ E.Data.AppendingGV.InitPrefix,
+ E.AppendingGVIsOldCtorDtor,
+ makeArrayRef(AppendingInits).slice(PrefixSize));
+ AppendingInits.resize(PrefixSize);
+ break;
+ }
+ case WorklistEntry::MapGlobalAliasee:
+ E.Data.GlobalAliasee.GA->setAliasee(
+ mapConstant(E.Data.GlobalAliasee.Aliasee));
+ break;
+ case WorklistEntry::RemapFunction:
+ remapFunction(*E.Data.RemapF);
+ break;
+ }
+ }
+ CurrentMCID = 0;
+
+ // Finish logic for block addresses now that all global values have been
+ // handled.
+ while (!DelayedBBs.empty()) {
+ DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
+ BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
+ DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
+ }
+}
+
+void Mapper::remapInstruction(Instruction *I) {
+ // Remap operands.
+ for (Use &Op : I->operands()) {
+ Value *V = mapValue(Op);
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ Op = V;
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced value not in value map!");
+ }
+
+ // Remap phi nodes' incoming blocks.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = mapValue(PN->getIncomingBlock(i));
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ PN->setIncomingBlock(i, cast<BasicBlock>(V));
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced block not in value map!");
+ }
+ }
+
+ // Remap attached metadata.
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ I->getAllMetadata(MDs);
+ for (const auto &MI : MDs) {
+ MDNode *Old = MI.second;
+ MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
+ if (New != Old)
+ I->setMetadata(MI.first, New);
+ }
+
+ if (!TypeMapper)
+ return;
+
+ // If the instruction's type is being remapped, do so now.
+ if (auto CS = CallSite(I)) {
+ SmallVector<Type *, 3> Tys;
+ FunctionType *FTy = CS.getFunctionType();
+ Tys.reserve(FTy->getNumParams());
+ for (Type *Ty : FTy->params())
+ Tys.push_back(TypeMapper->remapType(Ty));
+ CS.mutateFunctionType(FunctionType::get(
+ TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
+ return;
+ }
+ if (auto *AI = dyn_cast<AllocaInst>(I))
+ AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType()));
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ GEP->setSourceElementType(
+ TypeMapper->remapType(GEP->getSourceElementType()));
+ GEP->setResultElementType(
+ TypeMapper->remapType(GEP->getResultElementType()));
+ }
+ I->mutateType(TypeMapper->remapType(I->getType()));
+}
+
+void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ GO.getAllMetadata(MDs);
+ GO.clearMetadata();
+ for (const auto &I : MDs)
+ GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+}
+
+void Mapper::remapFunction(Function &F) {
+ // Remap the operands.
+ for (Use &Op : F.operands())
+ if (Op)
+ Op = mapValue(Op);
+
+ // Remap the metadata attachments.
+ remapGlobalObjectMetadata(F);
+
+ // Remap the argument types.
+ if (TypeMapper)
+ for (Argument &A : F.args())
+ A.mutateType(TypeMapper->remapType(A.getType()));
+
+ // Remap the instructions.
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ remapInstruction(&I);
+}
+
+void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers) {
+ SmallVector<Constant *, 16> Elements;
+ if (InitPrefix) {
+ unsigned NumElements =
+ cast<ArrayType>(InitPrefix->getType())->getNumElements();
+ for (unsigned I = 0; I != NumElements; ++I)
+ Elements.push_back(InitPrefix->getAggregateElement(I));
+ }
+
+ PointerType *VoidPtrTy;
+ Type *EltTy;
+ if (IsOldCtorDtor) {
+ // FIXME: This upgrade is done during linking to support the C API. See
+ // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
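+ // The old two-field ctor/dtor entry type is extended below with a trailing
+ // i8* field, and each new member gets a null value for that field.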
+ VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
+ auto &ST = *cast<StructType>(NewMembers.front()->getType());
+ Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+ EltTy = StructType::get(GV.getContext(), Tys, false);
+ }
+
+ for (auto *V : NewMembers) {
+ Constant *NewV;
+ if (IsOldCtorDtor) {
+ auto *S = cast<ConstantStruct>(V);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
+ } else {
+ NewV = cast_or_null<Constant>(mapValue(V));
+ }
+ Elements.push_back(NewV);
+ }
+
+ GV.setInitializer(ConstantArray::get(
+ cast<ArrayType>(GV.getType()->getElementType()), Elements));
+}
+
+void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalInit;
+ WE.MCID = MCID;
+ WE.Data.GVInit.GV = &GV;
+ WE.Data.GVInit.Init = &Init;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapAppendingVar;
+ WE.MCID = MCID;
+ WE.Data.AppendingGV.GV = &GV;
+ WE.Data.AppendingGV.InitPrefix = InitPrefix;
+ WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
+ WE.AppendingGVNumNewMembers = NewMembers.size();
+ Worklist.push_back(WE);
+ AppendingInits.append(NewMembers.begin(), NewMembers.end());
+}
+
+void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalAliasee;
+ WE.MCID = MCID;
+ WE.Data.GlobalAliasee.GA = &GA;
+ WE.Data.GlobalAliasee.Aliasee = &Aliasee;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::RemapFunction;
+ WE.MCID = MCID;
+ WE.Data.RemapF = &F;
+ Worklist.push_back(WE);
+}
+
+void Mapper::addFlags(RemapFlags Flags) {
+ assert(!hasWorkToDo() && "Expected to have flushed the worklist");
+ this->Flags = this->Flags | Flags;
+}
+
+static Mapper *getAsMapper(void *pImpl) {
+ return reinterpret_cast<Mapper *>(pImpl);
+}
+
+namespace {
+
+class FlushingMapper {
+ Mapper &M;
+
+public:
+ explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
+ assert(!M.hasWorkToDo() && "Expected to be flushed");
+ }
+ ~FlushingMapper() { M.flush(); }
+ Mapper *operator->() const { return &M; }
+};
+
+} // end namespace
+
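+// Illustrative use of the public API implemented below (OldArg, NewArg, and
+// ClonedF are hypothetical values owned by a cloning client):
+//
+//   ValueToValueMapTy VMap;
+//   VMap[OldArg] = NewArg;       // seed mappings for already-cloned values
+//   ValueMapper VM(VMap, RF_IgnoreMissingLocals);
+//   for (BasicBlock &BB : *ClonedF)
+//     for (Instruction &I : BB)
+//       VM.remapInstruction(I);  // rewrite operands and attached metadata
+//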
+ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer)
+ : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
+
+ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
+
+unsigned
+ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer) {
+ return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
+}
+
+void ValueMapper::addFlags(RemapFlags Flags) {
+ FlushingMapper(pImpl)->addFlags(Flags);
+}
+
+Value *ValueMapper::mapValue(const Value &V) {
+ return FlushingMapper(pImpl)->mapValue(&V);
+}
+
+Constant *ValueMapper::mapConstant(const Constant &C) {
+ return cast_or_null<Constant>(mapValue(C));
+}
+
+Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
+ return FlushingMapper(pImpl)->mapMetadata(&MD);
+}
+
+MDNode *ValueMapper::mapMDNode(const MDNode &N) {
+ return cast_or_null<MDNode>(mapMetadata(N));
+}
+
+void ValueMapper::remapInstruction(Instruction &I) {
+ FlushingMapper(pImpl)->remapInstruction(&I);
+}
+
+void ValueMapper::remapFunction(Function &F) {
+ FlushingMapper(pImpl)->remapFunction(F);
+}
+
+void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
+ Constant &Init,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
+}
+
+void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAppendingVariable(
+ GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID);
+}
+
+void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
new file mode 100644
index 000000000000..9cf66382b581
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -0,0 +1,1073 @@
+//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
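+// This pass merges consecutive, simple (non-atomic, non-volatile) loads or
+// stores within a basic block into wider vector loads and stores, primarily
+// for GPU targets.
+//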
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-store-vectorizer"
+STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
+STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+
+namespace {
+
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
+typedef SmallVector<Instruction *, 8> InstrList;
+typedef MapVector<Value *, InstrList> InstrListMap;
+
+class Vectorizer {
+ Function &F;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+
+public:
+ Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
+ ScalarEvolution &SE, TargetTransformInfo &TTI)
+ : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+ DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
+
+ bool run();
+
+private:
+ Value *getPointerOperand(Value *I) const;
+
+ GetElementPtrInst *getSourceGEP(Value *Src) const;
+
+ unsigned getPointerAddressSpace(Value *I);
+
+ unsigned getAlignment(LoadInst *LI) const {
+ unsigned Align = LI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(LI->getType());
+ }
+
+ unsigned getAlignment(StoreInst *SI) const {
+ unsigned Align = SI->getAlignment();
+ if (Align != 0)
+ return Align;
+
+ return DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
+
+ bool isConsecutiveAccess(Value *A, Value *B);
+
+ /// After vectorization, reorder the instructions that I depends on
+ /// (the instructions defining its operands), to ensure they dominate I.
+ void reorder(Instruction *I);
+
+ /// Returns the first and the last instructions in Chain.
+ std::pair<BasicBlock::iterator, BasicBlock::iterator>
+ getBoundaryInstrs(ArrayRef<Instruction *> Chain);
+
+ /// Erases the original instructions after vectorizing.
+ void eraseInstructions(ArrayRef<Instruction *> Chain);
+
+ /// "Legalize" the vector type that would be produced by combining \p
+ /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
+ /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
+ /// expected to have more than 4 elements.
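+ /// For example, a chain of five 16-bit elements (10 bytes) is split into a
+ /// 4-element piece (8 bytes) and a 1-element piece (2 bytes).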
+ std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+ splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
+
+ /// Finds the largest prefix of Chain that's vectorizable, checking for
+ /// intervening instructions which may affect the memory accessed by the
+ /// instructions within Chain.
+ ///
+ /// The elements of \p Chain must be all loads or all stores and must be in
+ /// address order.
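+ ///
+ /// For example, in a chain of loads, a store that may alias one of them acts
+ /// as a barrier: loads that precede the aliasing store in the block can still
+ /// form the prefix, while the conflicting load and everything after it are
+ /// dropped.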
+ ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
+
+ /// Collects load and store instructions to vectorize.
+ std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
+
+ /// Processes the collected instructions, the \p Map. The values of \p Map
+ /// should be all loads or all stores.
+ bool vectorizeChains(InstrListMap &Map);
+
+ /// Finds loads/stores to consecutive memory addresses and vectorizes them.
+ bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
+
+ /// Vectorizes the load instructions in Chain.
+ bool
+ vectorizeLoadChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Vectorizes the store instructions in Chain.
+ bool
+ vectorizeStoreChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Check whether this load/store access is misaligned.
+ bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment);
+};
+
+class LoadStoreVectorizer : public FunctionPass {
+public:
+ static char ID;
+
+ LoadStoreVectorizer() : FunctionPass(ID) {
+ initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "GPU Load and Store Vectorizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
+ "Vectorize load and Store instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+
+char LoadStoreVectorizer::ID = 0;
+
+Pass *llvm::createLoadStoreVectorizerPass() {
+ return new LoadStoreVectorizer();
+}
+
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+ SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+ propagateMetadata(I, VL);
+}
+
+bool LoadStoreVectorizer::runOnFunction(Function &F) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ return V.run();
+}
+
+// Vectorizer Implementation
+bool Vectorizer::run() {
+ bool Changed = false;
+
+ // Scan the blocks in the function in post order.
+ for (BasicBlock *BB : post_order(&F)) {
+ InstrListMap LoadRefs, StoreRefs;
+ std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
+ Changed |= vectorizeChains(LoadRefs);
+ Changed |= vectorizeChains(StoreRefs);
+ }
+
+ return Changed;
+}
+
+Value *Vectorizer::getPointerOperand(Value *I) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+unsigned Vectorizer::getPointerAddressSpace(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
+ // First strip pointer bitcasts. Make sure pointee size is the same with
+ // and without casts.
+ // TODO: a stride set by the add instruction below can match the difference
+ // in pointee type size here. Currently it will not be vectorized.
+ Value *SrcPtr = getPointerOperand(Src);
+ Value *SrcBase = SrcPtr->stripPointerCasts();
+ if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
+ DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
+ SrcPtr = SrcBase;
+ return dyn_cast<GetElementPtrInst>(SrcPtr);
+}
+
+// FIXME: Merge with llvm::isConsecutiveAccess
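+// Returns true when B accesses the memory location immediately following A,
+// i.e. when the address of B equals the address of A plus A's store size
+// (e.g. loads of p[i] and p[i+1] off the same base pointer p).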
+bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getPointerAddressSpace(A);
+ unsigned ASB = getPointerAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same size type.
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ Type *PtrATy = PtrA->getType()->getPointerElementType();
+ Type *PtrBTy = PtrB->getType()->getPointerElementType();
+ if (PtrA == PtrB ||
+ DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+ DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+ DL.getTypeStoreSize(PtrBTy->getScalarType()))
+ return false;
+
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == Size;
+
+ // Compute the base pointer delta needed to make the final delta equal to
+ // the size.
+ APInt BaseDelta = Size - OffsetDelta;
+
+ // Compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *C = SE.getConstant(BaseDelta);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
+ if (X == PtrSCEVB)
+ return true;
+
+ // Sometimes even this doesn't work, because SCEV can't always see through
+ // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+ // things the hard way.
+
+ // Look through GEPs after checking they're the same except for the last
+ // index.
+ GetElementPtrInst *GEPA = getSourceGEP(A);
+ GetElementPtrInst *GEPB = getSourceGEP(B);
+ if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+ return false;
+ unsigned FinalIndex = GEPA->getNumOperands() - 1;
+ for (unsigned i = 0; i < FinalIndex; i++)
+ if (GEPA->getOperand(i) != GEPB->getOperand(i))
+ return false;
+
+ Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+ Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+ if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+ OpA->getType() != OpB->getType())
+ return false;
+
+ // Only look through a ZExt/SExt.
+ if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+ return false;
+
+ bool Signed = isa<SExtInst>(OpA);
+
+ OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+ OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+ if (!OpA || !OpB || OpA->getType() != OpB->getType())
+ return false;
+
+ // Now we need to prove that adding 1 to OpA won't overflow.
+ bool Safe = false;
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+ // we're okay.
+ if (OpB->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(OpB->getOperand(1)) &&
+ cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+ if (Signed)
+ Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
+ else
+ Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+ }
+
+ unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+
+ // Second attempt:
+ // If any bits are known to be zero other than the sign bit in OpA, we can
+ // add 1 to it while guaranteeing no overflow of any sort.
+ if (!Safe) {
+ KnownBits Known(BitWidth);
+ computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
+ Safe = true;
+ }
+
+ if (!Safe)
+ return false;
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+ const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+ return X2 == OffsetSCEVB;
+}
+
+void Vectorizer::reorder(Instruction *I) {
+ OrderedBasicBlock OBB(I->getParent());
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!OBB.dominates(IM, I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+ ++BBI) {
+ if (!InstructionsToMove.count(&*BBI))
+ continue;
+ Instruction *IM = &*BBI;
+ --BBI;
+ IM->removeFromParent();
+ IM->insertBefore(I);
+ }
+}
+
+std::pair<BasicBlock::iterator, BasicBlock::iterator>
+Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
+ Instruction *C0 = Chain[0];
+ BasicBlock::iterator FirstInstr = C0->getIterator();
+ BasicBlock::iterator LastInstr = C0->getIterator();
+
+ BasicBlock *BB = C0->getParent();
+ unsigned NumFound = 0;
+ for (Instruction &I : *BB) {
+ if (!is_contained(Chain, &I))
+ continue;
+
+ ++NumFound;
+ if (NumFound == 1) {
+ FirstInstr = I.getIterator();
+ }
+ if (NumFound == Chain.size()) {
+ LastInstr = I.getIterator();
+ break;
+ }
+ }
+
+ // Range is [first, last).
+ return std::make_pair(FirstInstr, ++LastInstr);
+}
+
+void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
+ SmallVector<Instruction *, 16> Instrs;
+ for (Instruction *I : Chain) {
+ Value *PtrOperand = getPointerOperand(I);
+ assert(PtrOperand && "Instruction must have a pointer operand.");
+ Instrs.push_back(I);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
+ Instrs.push_back(GEP);
+ }
+
+ // Erase instructions.
+ for (Instruction *I : Instrs)
+ if (I->use_empty())
+ I->eraseFromParent();
+}
+
+std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
+ unsigned ElementSizeBits) {
+ unsigned ElementSizeBytes = ElementSizeBits / 8;
+ unsigned SizeBytes = ElementSizeBytes * Chain.size();
+ unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
+ NumLeft = 1;
+ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
+}
+
+ArrayRef<Instruction *>
+Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
+ // These are in BB order, unlike Chain, which is in address order.
+ SmallVector<Instruction *, 16> MemoryInstrs;
+ SmallVector<Instruction *, 16> ChainInstrs;
+
+ bool IsLoadChain = isa<LoadInst>(Chain[0]);
+ DEBUG({
+ for (Instruction *I : Chain) {
+ if (IsLoadChain)
+ assert(isa<LoadInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ else
+ assert(isa<StoreInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ }
+ });
+
+ for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ if (!is_contained(Chain, &I))
+ MemoryInstrs.push_back(&I);
+ else
+ ChainInstrs.push_back(&I);
+ } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
+ DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
+ break;
+ } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
+ DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
+ break;
+ }
+ }
+
+ OrderedBasicBlock OBB(Chain[0]->getParent());
+
+ // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+ unsigned ChainInstrIdx = 0;
+ Instruction *BarrierMemoryInstr = nullptr;
+
+ for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
+ Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
+
+ // If a barrier memory instruction was found, chain instructions that follow
+ // will not be added to the valid prefix.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+ break;
+
+ // Check (in BB order) if any instruction prevents ChainInstr from being
+ // vectorized. Find and store the first such "conflicting" instruction.
+ for (Instruction *MemInstr : MemoryInstrs) {
+ // If a barrier memory instruction was found, do not check past it.
+ if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+ break;
+
+ if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
+ continue;
+
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
+ OBB.dominates(ChainInstr, MemInstr))
+ continue;
+
+ // Same case, but in reverse.
+ if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
+ OBB.dominates(MemInstr, ChainInstr))
+ continue;
+
+ if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
+ MemoryLocation::get(ChainInstr))) {
+ DEBUG({
+ dbgs() << "LSV: Found alias:\n"
+ " Aliasing instruction and pointer:\n"
+ << " " << *MemInstr << '\n'
+ << " " << *getPointerOperand(MemInstr) << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainInstr << '\n'
+ << " " << *getPointerOperand(ChainInstr) << '\n';
+ });
+ // Save this aliasing memory instruction as a barrier, but allow other
+ // instructions that precede the barrier to be vectorized with this one.
+ BarrierMemoryInstr = MemInstr;
+ break;
+ }
+ }
+ // Continue the search only for store chains, since vectorizing stores that
+ // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+ // up to an aliasing store, but should not pull loads from further down in
+ // the basic block.
+ if (IsLoadChain && BarrierMemoryInstr) {
+ // The BarrierMemoryInstr is a store that precedes ChainInstr.
+ assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
+ break;
+ }
+ }
+
+ // Find the largest prefix of Chain whose elements are all in
+ // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+ // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+ // order.)
+ SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
+ ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
+ unsigned ChainIdx = 0;
+ for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+ if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
+ break;
+ }
+ return Chain.slice(0, ChainIdx);
+}
+
+std::pair<InstrListMap, InstrListMap>
+Vectorizer::collectInstructions(BasicBlock *BB) {
+ InstrListMap LoadRefs;
+ InstrListMap StoreRefs;
+
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
+ continue;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeLoad(LI))
+ continue;
+
+ Type *Ty = LI->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if (TySize < 8)
+ continue;
+
+ Value *Ptr = LI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2)
+ continue;
+
+ // Make sure all the users of a vector are constant-index extracts.
+ if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save the load locations.
+ Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ LoadRefs[ObjPtr].push_back(LI);
+
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeStore(SI))
+ continue;
+
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if (TySize < 8)
+ continue;
+
+ Value *Ptr = SI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ if (TySize > VecRegSize / 2)
+ continue;
+
+ if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save store location.
+ Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+ StoreRefs[ObjPtr].push_back(SI);
+ }
+ }
+
+ return {LoadRefs, StoreRefs};
+}
+
+bool Vectorizer::vectorizeChains(InstrListMap &Map) {
+ bool Changed = false;
+
+ for (const std::pair<Value *, InstrList> &Chain : Map) {
+ unsigned Size = Chain.second.size();
+ if (Size < 2)
+ continue;
+
+ DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+
+ // Process the chain elements (all loads or all stores) in chunks of 64.
+ for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
+ unsigned Len = std::min<unsigned>(CE - CI, 64);
+ ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
+ Changed |= vectorizeInstructions(Chunk);
+ }
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
+ DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
+ SmallVector<int, 16> Heads, Tails;
+ int ConsecutiveChain[64];
+
+ // Do a quadratic search over all of the given accesses (all loads or all
+ // stores) and find all of the pairs that follow each other in memory.
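+ // ConsecutiveChain[i] holds the index of the access that immediately follows
+ // Instrs[i] in memory (or -1 if none was found); following it from a head
+ // visits a chain in address order.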
+ for (int i = 0, e = Instrs.size(); i < e; ++i) {
+ ConsecutiveChain[i] = -1;
+ for (int j = e - 1; j >= 0; --j) {
+ if (i == j)
+ continue;
+
+ if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+ if (ConsecutiveChain[i] != -1) {
+ int CurDistance = std::abs(ConsecutiveChain[i] - i);
+ int NewDistance = std::abs(ConsecutiveChain[i] - j);
+ if (j < i || NewDistance > CurDistance)
+ continue; // Should not insert.
+ }
+
+ Tails.push_back(j);
+ Heads.push_back(i);
+ ConsecutiveChain[i] = j;
+ }
+ }
+ }
+
+ bool Changed = false;
+ SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+
+ for (int Head : Heads) {
+ if (InstructionsProcessed.count(Instrs[Head]))
+ continue;
+ bool LongerChainExists = false;
+ for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
+ if (Head == Tails[TIt] &&
+ !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
+ LongerChainExists = true;
+ break;
+ }
+ if (LongerChainExists)
+ continue;
+
+ // We found an instr that starts a chain. Now follow the chain and try to
+ // vectorize it.
+ SmallVector<Instruction *, 16> Operands;
+ int I = Head;
+ while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
+ if (InstructionsProcessed.count(Instrs[I]))
+ break;
+
+ Operands.push_back(Instrs[I]);
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = false;
+ if (isa<LoadInst>(*Operands.begin()))
+ Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
+ else
+ Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+bool Vectorizer::vectorizeStoreChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ StoreInst *S0 = cast<StoreInst>(Chain[0]);
+
+  // If the vector has an int element, default to int for the whole store.
+ Type *StoreTy;
+ for (Instruction *I : Chain) {
+ StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
+ if (StoreTy->isIntOrIntVectorTy())
+ break;
+
+ if (StoreTy->isPtrOrPtrVectorTy()) {
+ StoreTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(StoreTy));
+ break;
+ }
+ }
+
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned AS = S0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(S0);
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ VectorType *VecTy;
+ VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
+ if (VecStoreTy)
+ VecTy = VectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
+ else
+ VecTy = VectorType::get(StoreTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
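+  // For example, with TargetVF == 4 a chain of six stores is emitted as a
+  // four-element piece followed by a two-element piece, each handled
+  // recursively by the calls below.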
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeStoreChain(Chain.slice(0, TargetVF),
+ InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ DEBUG({
+ dbgs() << "LSV: Stores to vectorize:\n";
+ for (Instruction *I : Chain)
+ dbgs() << " " << *I << "\n";
+ });
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the store is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (S0->getPointerAddressSpace() != 0)
+ return false;
+
+ unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ StackAdjustedAlignment,
+ DL, S0, nullptr, &DT);
+ if (NewAlign < StackAdjustedAlignment)
+ return false;
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*Last);
+
+ Value *Vec = UndefValue::get(VecTy);
+
+ if (VecStoreTy) {
+ unsigned VecWidth = VecStoreTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
+ unsigned NewIdx = J + I * VecWidth;
+ Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
+ Builder.getInt32(J));
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
+ Vec = Insert;
+ }
+ }
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ Value *Extract = Store->getValueOperand();
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract =
+ Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
+ Vec = Insert;
+ }
+ }
+
+ // This cast is safe because Builder.CreateStore() always creates a bona fide
+ // StoreInst.
+ StoreInst *SI = cast<StoreInst>(
+ Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
+ VecTy->getPointerTo(AS))));
+ propagateMetadata(SI, Chain);
+ SI->setAlignment(Alignment);
+
+ eraseInstructions(Chain);
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::vectorizeLoadChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ LoadInst *L0 = cast<LoadInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole load.
+ Type *LoadTy;
+ for (const auto &V : Chain) {
+ LoadTy = cast<LoadInst>(V)->getType();
+ if (LoadTy->isIntOrIntVectorTy())
+ break;
+
+ if (LoadTy->isPtrOrPtrVectorTy()) {
+ LoadTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(LoadTy));
+ break;
+ }
+ }
+
+ unsigned Sz = DL.getTypeSizeInBits(LoadTy);
+ unsigned AS = L0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ unsigned Alignment = getAlignment(L0);
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ VectorType *VecTy;
+ VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
+ if (VecLoadTy)
+ VecTy = VectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
+ else
+ VecTy = VectorType::get(LoadTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the load is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (L0->getPointerAddressSpace() != 0)
+ return false;
+
+ unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ StackAdjustedAlignment,
+ DL, L0, nullptr, &DT);
+ if (NewAlign < StackAdjustedAlignment)
+ return false;
+
+ Alignment = NewAlign;
+ }
+
+ DEBUG({
+ dbgs() << "LSV: Loads to vectorize:\n";
+ for (Instruction *I : Chain)
+ I->dump();
+ });
+
+ // getVectorizablePrefix already computed getBoundaryInstrs. The value of
+ // Last may have changed since then, but the value of First won't have. If it
+ // matters, we could compute getBoundaryInstrs only once and reuse it here.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*First);
+
+ Value *Bitcast =
+ Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
+ // This cast is safe because Builder.CreateLoad always creates a bona fide
+ // LoadInst.
+ LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
+ propagateMetadata(LI, Chain);
+ LI->setAlignment(Alignment);
+
+ if (VecLoadTy) {
+ SmallVector<Instruction *, 16> InstrsToErase;
+
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ for (auto Use : Chain[I]->users()) {
+ // All users of vector loads are ExtractElement instructions with
+ // constant indices, otherwise we would have bailed before now.
+ Instruction *UI = cast<Instruction>(Use);
+ unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
+ unsigned NewIdx = Idx + I * VecWidth;
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
+ UI->getName());
+ if (V->getType() != UI->getType())
+ V = Builder.CreateBitCast(V, UI->getType());
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(V);
+ InstrsToErase.push_back(UI);
+ }
+ }
+
+    // Bitcast might not be an Instruction, if the pointer operand is a
+    // constant. In that case, no need to reorder anything.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V =
+ Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ }
+
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
+ }
+
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+ }
+
+ eraseInstructions(Chain);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
+bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment) {
+ if (Alignment % SzInBytes == 0)
+ return false;
+
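+  // For example, a 16-byte access with only 4-byte alignment reaches this
+  // point (4 % 16 != 0) and is treated as misaligned unless the target
+  // reports that unaligned accesses of this width are both allowed and fast.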
+ bool Fast = false;
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+ SzInBytes * 8, AddressSpace,
+ Alignment, &Fast);
+ DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
+ return !Allows || !Fast;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 000000000000..eb82ee283d44
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,8128 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// The interleaved access vectorization is based on the paper:
+// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
+// Data for SIMD
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+#include <tuple>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
+static cl::opt<unsigned> TinyTripCountVectorThreshold(
+ "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
+
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
+static cl::opt<bool> EnableInterleavedMemAccesses(
+ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+
+/// Maximum factor for an interleaved memory access.
+static cl::opt<unsigned> MaxInterleaveGroupFactor(
+ "max-interleave-group-factor", cl::Hidden,
+ cl::desc("Maximum factor for an interleaved access group (default = 8)"),
+ cl::init(8));
+
+/// We don't interleave loops with a known constant trip count below this
+/// number.
+static const unsigned TinyTripCountInterleaveThreshold = 128;
+
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when interleaving"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(true), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
+ "reduction in a nested loop."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis
+createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
+ Instruction *I = nullptr) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+    // If there is no debug location attached to the instruction, fall back
+    // to using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
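+
+// A typical call site (hypothetical remark name shown) streams the reason into
+// the returned remark, for example:
+//   ORE->emit(createMissedAnalysis(LV_NAME, "CantVectorize", TheLoop)
+//             << "loop contains an unvectorizable construct");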
+
+namespace {
+
+// Forward declarations.
+class LoopVectorizeHints;
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class LoopVectorizationRequirements;
+
+/// Returns true if the given loop body has a cycle, excluding the loop
+/// itself.
+static bool hasCyclesInLoopBody(const Loop &L) {
+ if (!L.empty())
+ return true;
+
+ for (const auto &SCC :
+ make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
+ scc_iterator<Loop, LoopBodyTraits>::end(L))) {
+ if (SCC.size() > 1) {
+ DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
+ DEBUG(L.dump());
+ return true;
+ }
+ }
+ return false;
+}
+
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type *ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
+
+// FIXME: The following helper functions have multiple implementations
+// in the project. They can be effectively organized in a common Load/Store
+// utilities unit.
+
+/// A helper function that returns the pointer operand of a load or store
+/// instruction.
+static Value *getPointerOperand(Value *I) {
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of load or store instruction.
+static unsigned getMemInstAlignment(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getAlignment();
+ return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerAddressSpace();
+ return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
+/// A helper function that returns true if the given type is irregular. The
+/// type is irregular if its allocated size doesn't equal the store size of an
+/// element of the corresponding vector type at the given vectorization factor.
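+///
+/// For instance, on typical x86 data layouts x86_fp80 has a type size of 80
+/// bits but an alloc size of 96 or 128 bits, so it is irregular at VF == 1.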
+static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
+
+ // Determine if an array of VF elements of type Ty is "bitcast compatible"
+ // with a <VF x Ty> vector.
+ if (VF > 1) {
+ auto *VectorTy = VectorType::get(Ty, VF);
+ return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
+ }
+
+ // If the vectorization factor is one, we just check if an array of type Ty
+ // requires padding between elements.
+ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
+}
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+/// we always assume predicated blocks have a 50% chance of executing.
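+///
+/// For example, with the returned value of 2, a predicated block whose
+/// scalarized cost is C is assumed to contribute roughly C / 2 to the
+/// expected per-iteration cost of the loop.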
+static unsigned getReciprocalPredBlockProb() { return 2; }
+
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found for a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned VecWidth,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()), Induction(nullptr),
+ OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
+ TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
+ AddedSafetyChecks(false) {}
+
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+ void fixVectorizedLoop();
+
+ // Return true if any runtime check is added.
+ bool areSafetyChecksAdded() { return AddedSafetyChecks; }
+
+ virtual ~InnerLoopVectorizer() {}
+
+protected:
+ /// A small list of PHINodes.
+ typedef SmallVector<PHINode *, 4> PhiVector;
+
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in the
+ /// new unrolled loop, where UF is the unroll factor.
+ typedef SmallVector<Value *, 2> VectorParts;
+
+ /// A type for scalarized values in the new loop. Each value from the
+ /// original loop, when scalarized, is represented by UF x VF scalar values
+ /// in the new unrolled loop, where UF is the unroll factor and VF is the
+ /// vectorization factor.
+ typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
+
+ // When we if-convert we need to create edge masks. We have to cache values
+ // so that we don't end up with exponential recursion/IR.
+ typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
+ EdgeMaskCacheTy;
+ typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
+
+ /// Set up the values of the IVs correctly when exiting the vector loop.
+ void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock);
+
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
+
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
+ /// Fix a first-order recurrence. This is the second phase of vectorizing
+ /// this phi node.
+ void fixFirstOrderRecurrence(PHINode *Phi);
+
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
+ /// \brief The Loop exit block may have single value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop and we should have one value for
+ /// each predecessor of its parent basic block. See PR14725.
+ void fixLCSSAPHIs();
+
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it.
+ void sinkScalarOperands(Instruction *PredInst);
+
+ /// Predicate conditional instructions that require predication on their
+ /// respective conditions.
+ void predicateInstructions();
+
+ /// Shrinks vector element sizes to the smallest bitwidth they can be legally
+ /// represented as.
+ void truncateToMinimalBitwidths();
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VectorParts createBlockInMask(BasicBlock *BB);
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
+
+ /// This instruction is un-vectorizable. Implement it as a sequence
+ /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
+ /// scalarized instruction behind an if block predicated on the control
+ /// dependence of the instruction.
+ void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
+
+  /// Vectorize Load and Store instructions.
+ virtual void vectorizeMemoryInstruction(Instruction *Instr);
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+ /// this is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ virtual Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of \p Val. The sequence starts at \p StartIdx.
+ /// \p Opcode is relevant for FP induction variable.
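+  ///
+  /// For example, with VF 4, \p Val a splat of %iv, StartIdx 0 and Step 1,
+  /// the result is <%iv, %iv + 1, %iv + 2, %iv + 3>.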
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd);
+
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step, and
+ /// \p EntryVal is the value from the original loop that maps to the steps.
+ /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
+ /// can be a truncate instruction).
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ Value *Step, Instruction *EntryVal);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
+ /// Returns true if we should generate a scalar version of \p IV.
+ bool needsScalarInduction(Instruction *IV) const;
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part and vector index \p Lane. If the value has
+ /// been vectorized but not scalarized, the necessary extractelement
+ /// instruction will be generated.
+ Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to.
+ void vectorizeInterleaveGroup(Instruction *Instr);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ virtual Value *reverseVector(Value *Vec);
+
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Emit a bypass check to see if the trip count would overflow, or we
+ /// wouldn't have enough iterations to execute one vector loop.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if the vector trip count is nonzero.
+ void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// \brief Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+ /// This is a helper class for maintaining vectorization state. It's used for
+ /// mapping values from the original loop to their corresponding values in
+ /// the new loop. Two mappings are maintained: one for vectorized values and
+ /// one for scalarized values. Vectorized values are represented with UF
+ /// vector values in the new loop, and scalarized values are represented with
+ /// UF x VF scalar values in the new loop. UF and VF are the unroll and
+ /// vectorization factors, respectively.
+ ///
+ /// Entries can be added to either map with setVectorValue and setScalarValue,
+ /// which assert that an entry was not already added before. If an entry is to
+ /// replace an existing one, call resetVectorValue. This is currently needed
+ /// to modify the mapped values during "fix-up" operations that occur once the
+ /// first phase of widening is complete. These operations include type
+ /// truncation and the second phase of recurrence widening.
+ ///
+ /// Entries from either map can be retrieved using the getVectorValue and
+ /// getScalarValue functions, which assert that the desired value exists.
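+  ///
+  /// A sketch of the intended use (with the VectorLoopValueMap member
+  /// declared further below): widening a definition D for unroll part P
+  /// records the wide value, and later uses read it back:
+  ///   VectorLoopValueMap.setVectorValue(D, P, WideD);
+  ///   Value *V = VectorLoopValueMap.getVectorValue(D, P);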
+
+ struct ValueMap {
+
+ /// Construct an empty map with the given unroll and vectorization factors.
+ ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key, \p Part and
+    /// \p Lane.
+ bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
+ assert(Part < UF && "Queried Scalar Part is too large.");
+ assert(Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
+ return Entry[Part][Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
+ /// and \p Lane.
+ Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
+ assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Part][Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key for \p Part and \p Lane.
+ /// Assumes such a value is not already set.
+ void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
+ Value *Scalar) {
+ assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Part][Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ private:
+ /// The unroll factor. Each entry in the vector map contains UF vector
+ /// values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
+ unsigned VF;
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+ };
+
+ /// The original loop.
+ Loop *OrigLoop;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
+ /// Loop Info.
+ LoopInfo *LI;
+ /// Dominator Tree.
+ DominatorTree *DT;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+ /// Assumption Cache.
+ AssumptionCache *AC;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+ /// memchecks. The actually versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
+ unsigned VF;
+
+protected:
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+ /// The builder that we use
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+ /// The ExitBlock of the scalar loop.
+ BasicBlock *LoopExitBlock;
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+ /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction;
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction;
+
+ /// Maps values from the original loop to their corresponding values in the
+ /// vectorized loop. A key value can map to either vector values, scalar
+ /// values or both kinds of values, depending on whether the key was
+ /// vectorized and scalarized.
+ ValueMap VectorLoopValueMap;
+
+  /// Instructions that should be predicated, stored as a pair
+  /// <Instruction, Predicate>.
+ SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+ /// Trip count of the original loop.
+ Value *TripCount;
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel *Cost;
+
+ // Record whether runtime checks are added.
+ bool AddedSafetyChecks;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix-up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
+};
+
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
+ LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
+ UnrollFactor, LVL, CM) {}
+
+private:
+ void vectorizeMemoryInstruction(Instruction *Instr) override;
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd) override;
+ Value *reverseVector(Value *Vec) override;
+};
+
+/// \brief Look for a meaningful debug location on the instruction or its
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+ if (!I)
+ return I;
+
+ DebugLoc Empty;
+ if (I->getDebugLoc() != Empty)
+ return I;
+
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ if (OpInst->getDebugLoc() != Empty)
+ return OpInst;
+ }
+
+ return I;
+}
+
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
+ B.SetCurrentDebugLocation(DebugLoc());
+}
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+ std::string Result;
+ if (L) {
+ raw_string_ostream OS(Result);
+ if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+ LoopDbgLoc.print(OS);
+ else
+ // Just print the module name.
+ OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+ OS.flush();
+ }
+ return Result;
+}
+#endif
+
+void InnerLoopVectorizer::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
+
+void InnerLoopVectorizer::addMetadata(Instruction *To,
+ Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
+}
+
+void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
+ Instruction *From) {
+ for (Value *V : To) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ addMetadata(I, From);
+ }
+}
+
+/// \brief The group of interleaved loads/stores sharing the same stride and
+/// close to each other.
+///
+/// Each member in this group has an index starting from 0, and the largest
+/// index should be less than the interleave factor, which equals the absolute
+/// value of the access's stride.
+///
+/// E.g. An interleaved load group of factor 4:
+/// for (unsigned i = 0; i < 1024; i+=4) {
+/// a = A[i]; // Member of index 0
+/// b = A[i+1]; // Member of index 1
+/// d = A[i+3]; // Member of index 3
+/// ...
+/// }
+///
+/// An interleaved store group of factor 4:
+/// for (unsigned i = 0; i < 1024; i+=4) {
+/// ...
+/// A[i] = a; // Member of index 0
+/// A[i+1] = b; // Member of index 1
+/// A[i+2] = c; // Member of index 2
+/// A[i+3] = d; // Member of index 3
+/// }
+///
+/// Note: the interleaved load group could have gaps (missing members), but
+/// the interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+ InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+ : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+ assert(Align && "The alignment should be non-zero");
+
+ Factor = std::abs(Stride);
+ assert(Factor > 1 && "Invalid interleave factor");
+
+ Reverse = Stride < 0;
+ Members[0] = Instr;
+ }
+
+ bool isReverse() const { return Reverse; }
+ unsigned getFactor() const { return Factor; }
+ unsigned getAlignment() const { return Align; }
+ unsigned getNumMembers() const { return Members.size(); }
+
+ /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign. The index is relative to the leader and it could be
+ /// negative if it is the new leader.
+ ///
+ /// \returns false if the instruction doesn't belong to the group.
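+  ///
+  /// For example, inserting with \p Index == -1 relative to the current
+  /// leader makes the new member the leader (provided the factor bound still
+  /// holds): SmallestKey decreases by one while the existing members keep
+  /// their keys.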
+ bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+ assert(NewAlign && "The new member's alignment should be non-zero");
+
+ int Key = Index + SmallestKey;
+
+ // Skip if there is already a member with the same index.
+ if (Members.count(Key))
+ return false;
+
+ if (Key > LargestKey) {
+ // The largest index is always less than the interleave factor.
+ if (Index >= static_cast<int>(Factor))
+ return false;
+
+ LargestKey = Key;
+ } else if (Key < SmallestKey) {
+      // Keep the span between the smallest and largest keys within the
+      // interleave factor.
+ if (LargestKey - Key >= static_cast<int>(Factor))
+ return false;
+
+ SmallestKey = Key;
+ }
+
+ // It's always safe to select the minimum alignment.
+ Align = std::min(Align, NewAlign);
+ Members[Key] = Instr;
+ return true;
+ }
+
+ /// \brief Get the member with the given index \p Index
+ ///
+  /// \returns nullptr if the group contains no such member.
+ Instruction *getMember(unsigned Index) const {
+ int Key = SmallestKey + Index;
+ if (!Members.count(Key))
+ return nullptr;
+
+ return Members.find(Key)->second;
+ }
+
+ /// \brief Get the index for the given member. Unlike the key in the member
+ /// map, the index starts from 0.
+ unsigned getIndex(Instruction *Instr) const {
+ for (auto I : Members)
+ if (I.second == Instr)
+ return I.first - SmallestKey;
+
+ llvm_unreachable("InterleaveGroup contains no such member");
+ }
+
+ Instruction *getInsertPos() const { return InsertPos; }
+ void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+private:
+ unsigned Factor; // Interleave Factor.
+ bool Reverse;
+ unsigned Align;
+ DenseMap<int, Instruction *> Members;
+ int SmallestKey;
+ int LargestKey;
+
+ // To avoid breaking dependences, vectorized instructions of an interleave
+ // group should be inserted at either the first load or the last store in
+ // program order.
+ //
+ // E.g. %even = load i32 // Insert Position
+ // %add = add i32 %even // Use of %even
+ // %odd = load i32
+ //
+ // store i32 %even
+ // %odd = add i32 // Def of %odd
+ // store i32 %odd // Insert Position
+ Instruction *InsertPos;
+};
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Use this class to analyze interleaved accesses only when we can vectorize
+/// a loop. Otherwise it's meaningless to do analysis as the vectorization
+/// on interleaved accesses is unsafe.
+///
+/// The analysis collects interleave groups and records the relationships
+/// between the member and the group in a map.
+class InterleavedAccessInfo {
+public:
+ InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
+ DominatorTree *DT, LoopInfo *LI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+ RequiresScalarEpilogue(false) {}
+
+ ~InterleavedAccessInfo() {
+ SmallSet<InterleaveGroup *, 4> DelSet;
+ // Avoid releasing a pointer twice.
+ for (auto &I : InterleaveGroupMap)
+ DelSet.insert(I.second);
+ for (auto *Ptr : DelSet)
+ delete Ptr;
+ }
+
+ /// \brief Analyze the interleaved accesses and collect them in interleave
+ /// groups. Substitute symbolic strides using \p Strides.
+ void analyzeInterleaving(const ValueToValueMap &Strides);
+
+ /// \brief Check if \p Instr belongs to any interleave group.
+ bool isInterleaved(Instruction *Instr) const {
+ return InterleaveGroupMap.count(Instr);
+ }
+
+ /// \brief Return the maximum interleave factor of all interleaved groups.
+ unsigned getMaxInterleaveFactor() const {
+ unsigned MaxFactor = 1;
+ for (auto &Entry : InterleaveGroupMap)
+ MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
+ return MaxFactor;
+ }
+
+ /// \brief Get the interleave group that \p Instr belongs to.
+ ///
+  /// \returns nullptr if \p Instr does not belong to any interleave group.
+ InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+ if (InterleaveGroupMap.count(Instr))
+ return InterleaveGroupMap.find(Instr)->second;
+ return nullptr;
+ }
+
+ /// \brief Returns true if an interleaved group that may access memory
+ /// out-of-bounds requires a scalar epilogue iteration for correctness.
+ bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+
+ /// \brief Initialize the LoopAccessInfo used for dependence checking.
+ void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
+private:
+ /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
+ /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
+ /// The interleaved access analysis can also add new predicates (for example
+ /// by versioning strides of pointers).
+ PredicatedScalarEvolution &PSE;
+ Loop *TheLoop;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI;
+
+ /// True if the loop may contain non-reversed interleaved groups with
+ /// out-of-bounds accesses. We ensure we don't speculatively access memory
+ /// out-of-bounds by executing at least one scalar epilogue iteration.
+ bool RequiresScalarEpilogue;
+
+ /// Holds the relationships between the members and the interleave group.
+ DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+
+ /// Holds dependences among the memory accesses in the loop. It maps a source
+ /// access to a set of dependent sink accesses.
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
+ /// \brief The descriptor for a strided memory access.
+ struct StrideDescriptor {
+ StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
+ unsigned Align)
+ : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
+
+ StrideDescriptor() = default;
+
+ // The access's stride. It is negative for a reverse access.
+ int64_t Stride = 0;
+ const SCEV *Scev = nullptr; // The scalar expression of this access
+ uint64_t Size = 0; // The size of the memory object.
+ unsigned Align = 0; // The alignment of this access.
+ };
+
+ /// \brief A type for holding instructions and their stride descriptors.
+ typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
+ /// \brief Create a new interleave group with the given instruction \p Instr,
+ /// stride \p Stride and alignment \p Align.
+ ///
+ /// \returns the newly created interleave group.
+ InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
+ unsigned Align) {
+ assert(!InterleaveGroupMap.count(Instr) &&
+ "Already in an interleaved access group");
+ InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
+ return InterleaveGroupMap[Instr];
+ }
+
+ /// \brief Release the group and remove all the relationships.
+ void releaseGroup(InterleaveGroup *Group) {
+ for (unsigned i = 0; i < Group->getFactor(); i++)
+ if (Instruction *Member = Group->getMember(i))
+ InterleaveGroupMap.erase(Member);
+
+ delete Group;
+ }
+
+ /// \brief Collect all the accesses with a constant stride in program order.
+ void collectConstStrideAccesses(
+ MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
+ const ValueToValueMap &Strides);
+
+ /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ static bool isStrided(int Stride) {
+ unsigned Factor = std::abs(Stride);
+ return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+ }
+
+ /// \brief Returns true if \p BB is a predicated block.
+ bool isPredicated(BasicBlock *BB) const {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+ }
+
+ /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ bool areDependencesValid() const {
+ return LAI && LAI->getDepChecker().getDependences();
+ }
+
+ /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
+ /// necessary, when constructing interleaved groups.
+ ///
+ /// \p A must precede \p B in program order. We return false if reordering is
+ /// not necessary or is prevented because \p A and \p B may be dependent.
+ bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
+ StrideEntry *B) const {
+
+ // Code motion for interleaved accesses can potentially hoist strided loads
+ // and sink strided stores. The code below checks the legality of the
+ // following two conditions:
+ //
+ // 1. Potentially moving a strided load (B) before any store (A) that
+ // precedes B, or
+ //
+ // 2. Potentially moving a strided store (A) after any load or store (B)
+ // that A precedes.
+ //
+ // It's legal to reorder A and B if we know there isn't a dependence from A
+ // to B. Note that this determination is conservative since some
+ // dependences could potentially be reordered safely.
+
+ // A is potentially the source of a dependence.
+ auto *Src = A->first;
+ auto SrcDes = A->second;
+
+ // B is potentially the sink of a dependence.
+ auto *Sink = B->first;
+ auto SinkDes = B->second;
+
+ // Code motion for interleaved accesses can't violate WAR dependences.
+ // Thus, reordering is legal if the source isn't a write.
+ if (!Src->mayWriteToMemory())
+ return true;
+
+ // At least one of the accesses must be strided.
+ if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+ return true;
+
+ // If dependence information is not available from LoopAccessInfo,
+ // conservatively assume the instructions can't be reordered.
+ if (!areDependencesValid())
+ return false;
+
+ // If we know there is a dependence from source to sink, assume the
+ // instructions can't be reordered. Otherwise, reordering is legal.
+ return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+ }
+
+ /// \brief Collect the dependences from LoopAccessInfo.
+ ///
+ /// We process the dependences once during the interleaved access analysis to
+ /// enable constant-time dependence queries.
+ void collectDependences() {
+ if (!areDependencesValid())
+ return;
+ auto *Deps = LAI->getDepChecker().getDependences();
+ for (auto Dep : *Deps)
+ Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+ }
+};
+
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them unless the user has specifically asked for them.
+class LoopVectorizeHints {
+ enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
+
+ /// Hint - associates name and validation with the hint value.
+ struct Hint {
+ const char *Name;
+ unsigned Value; // This may have to change for non-numeric values.
+ HintKind Kind;
+
+ Hint(const char *Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) {}
+
+ bool validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ }
+ return false;
+ }
+ };
+
+ /// Vectorization width.
+ Hint Width;
+ /// Vectorization interleave factor.
+ Hint Interleave;
+ /// Vectorization forced.
+ Hint Force;
+
+ /// Return the loop metadata prefix.
+ static StringRef Prefix() { return "llvm.loop."; }
+
+ /// True if there is any unsafe math in the loop.
+ bool PotentiallyUnsafe;
+
+public:
+ enum ForceKind {
+ FK_Undefined = -1, ///< Not selected.
+ FK_Disabled = 0, ///< Forcing disabled.
+ FK_Enabled = 1, ///< Forcing enabled.
+ };
+
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+ HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+ }
+
+ /// Mark the loop L as already vectorized by setting the width to 1.
+ void setAlreadyVectorized() {
+ Width.Value = Interleave.Value = 1;
+ Hint Hints[] = {Width, Interleave};
+ writeHintsToMetadata(Hints);
+ }
+
+ bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getWidth() == 1 && getInterleave() == 1) {
+ // FIXME: Add a separate metadata to indicate when the loop has already
+ // been vectorized instead of setting width and count to 1.
+ DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or vectorize width and interleave "
+ "count are both set to 1");
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Dumps all the hint information.
+ void emitRemarkWithHints() const {
+ using namespace ore;
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled");
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
+ R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ ORE.emit(R);
+ }
+ }
+
+ unsigned getWidth() const { return Width.Value; }
+ unsigned getInterleave() const { return Interleave.Value; }
+ enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+
+ /// \brief If hints are provided that force vectorization, use the AlwaysPrint
+ /// pass name to force the frontend to print the diagnostic.
+ const char *vectorizeAnalysisPassName() const {
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+ }
+
+ bool allowReordering() const {
+ // When hints that enable vectorization are provided, we allow the
+ // vectorizer to change the order of operations given by the scalar loop.
+ // This is not enabled by default because it can be unsafe or inefficient.
+ // For example, reordering floating-point operations will change the way
+ // round-off error accumulates in the loop.
+ return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+ }
+
+ bool isPotentiallyUnsafe() const {
+ // Avoid FP vectorization if the target is unsure about proper support.
+ // This may be related to the SIMD unit in the target not handling
+ // IEEE 754 FP ops properly, or bad single-to-double promotions.
+ // Otherwise, a sequence of vectorized loops, even without reduction,
+ // could lead to different end results on the destination vectors.
+ return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+ }
+
+ void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
+
+private:
+ /// Find hints specified in the loop metadata and update local values.
+ void getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either an MDString or an MDNode whose first
+ // operand is an MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Process hints with a single operand; setHint checks the loop metadata
+ // prefix and validates the value.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+ }
+
+ /// Checks a string hint with one operand and sets the value if valid.
+ void setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
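+ // Match the stripped hint name against the known hints and set its value
+ // if it passes validation.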
+ Hint *Hints[] = {&Width, &Interleave, &Force};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+ }
+
+ /// Create a new hint from name / value pair.
+ MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {MDString::get(Context, Name),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+ }
+
+ /// Matches metadata with hint name.
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+ MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+ }
+
+ /// Sets current hints into loop metadata, keeping other values intact.
+ void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.size() == 0)
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+ }
+
+ /// The loop these hints belong to.
+ const Loop *TheLoop;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
+};
+
+static void emitMissedWarning(Function *F, Loop *L,
+ const LoopVectorizeHints &LH,
+ OptimizationRemarkEmitter *ORE) {
+ LH.emitRemarkWithHints();
+
+ if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+ if (LH.getWidth() != 1)
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: "
+ << "failed explicitly specified loop vectorization");
+ else if (LH.getInterleave() != 1)
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
+ L->getHeader())
+ << "loop not interleaved: "
+ << "failed explicitly specified loop interleaving");
+ }
+}
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorizable,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying the
+/// induction variables and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+ LoopVectorizationLegality(
+ Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+ const TargetTransformInfo *TTI,
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
+ LoopVectorizeHints *H)
+ : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
+ GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
+ PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ Requirements(R), Hints(H) {}
+
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
+
+ /// InductionList saves induction variables and maps them to the
+ /// induction descriptor.
+ typedef MapVector<PHINode *, InductionDescriptor> InductionList;
+
+ /// RecurrenceSet contains the phi nodes that are recurrences other than
+ /// inductions and reductions.
+ typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
+
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
+
+ /// Returns the primary induction variable.
+ PHINode *getPrimaryInduction() { return PrimaryInduction; }
+
+ /// Returns the reduction variables found in the loop.
+ ReductionList *getReductionVars() { return &Reductions; }
+
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Return the first-order recurrences found in the loop.
+ RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
+
+ /// Return the set of instructions to sink to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+
+ /// Returns the widest induction type.
+ Type *getWidestInductionType() { return WidestIndTy; }
+
+ /// Returns True if V is an induction variable in this loop.
+ bool isInductionVariable(const Value *V);
+
+ /// Returns True if PN is a reduction variable in this loop.
+ bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
+ /// Returns True if Phi is a first-order recurrence in this loop.
+ bool isFirstOrderRecurrence(const PHINode *Phi);
+
+ /// Return true if the block BB needs to be predicated in order for the loop
+ /// to be vectorized.
+ bool blockNeedsPredication(BasicBlock *BB);
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+ /// when the last index of the GEP is the induction variable, or when the
+ /// pointer itself is an induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
+ /// Returns:
+ /// 0 - Stride is unknown or non-consecutive.
+ /// 1 - Address is consecutive.
+ /// -1 - Address is consecutive, and decreasing.
+ int isConsecutivePtr(Value *Ptr);
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
+ /// Returns the information that we collected about runtime memory checks.
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return LAI->getRuntimePointerChecking();
+ }
+
+ const LoopAccessInfo *getLAI() const { return LAI; }
+
+ /// \brief Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// \brief Return the maximum interleave factor of all interleaved groups.
+ unsigned getMaxInterleaveFactor() const {
+ return InterleaveInfo.getMaxInterleaveFactor();
+ }
+
+ /// \brief Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// \brief Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
+ unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
+
+ bool hasStride(Value *V) { return LAI->hasStride(V); }
+
+ /// Returns true if the target machine supports a masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
+ }
+ /// Returns true if the target machine supports a masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
+ }
+ /// Returns true if the target machine supports a masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI->isLegalMaskedScatter(DataType);
+ }
+ /// Returns true if the target machine supports a masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI->isLegalMaskedGather(DataType);
+ }
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ auto *LI = dyn_cast<LoadInst>(V);
+ auto *SI = dyn_cast<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
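+ // A gather widens a load and a scatter widens a store; check legality
+ // against the type of the accessed elements.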
+ auto *Ptr = getPointerOperand(V);
+ auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if the vector representation of the instruction \p I
+ /// requires a mask.
+ bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
+ unsigned getNumStores() const { return LAI->getNumStores(); }
+ unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ unsigned getNumPredStores() const { return NumPredStores; }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
+private:
+ /// Check if a single basic block loop is vectorizable.
+ /// At this point we know that this is a loop with a constant trip count
+ /// and we only need to check individual instructions.
+ bool canVectorizeInstrs();
+
+ /// When we vectorize loops we may change the order in which
+ /// we read and write from memory. This method checks if it is
+ /// legal to vectorize the code, considering only memory constraints.
+ /// Returns true if the loop is vectorizable.
+ bool canVectorizeMemory();
+
+ /// Return true if we can vectorize this loop using the IF-conversion
+ /// transformation.
+ bool canVectorizeWithIfConvert();
+
+ /// Return true if all of the instructions in the block can be speculatively
+ /// executed. \p SafePtrs is a list of addresses that are known to be legal
+ /// and we know that we can read from them without segfault.
+ bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+
+ /// Updates the vectorization state by adding \p Phi to the inductions list.
+ /// This can set \p Phi as the main induction of the loop if \p Phi is a
+ /// better choice for the main induction than the existing one.
+ void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit);
+
+ /// Create an analysis remark that explains why vectorization failed
+ ///
+ /// \p RemarkName is the identifier for the remark. If \p I is passed it is
+ /// an instruction that prevents vectorization. Otherwise the loop is used
+ /// for the location of the remark. \return the remark object that can be
+ /// streamed to.
+ OptimizationRemarkAnalysis
+ createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
+ return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ RemarkName, TheLoop, I);
+ }
+
+ /// \brief If an access has a symbolic stride, this maps the pointer value to
+ /// the stride symbol.
+ const ValueToValueMap *getSymbolicStrides() {
+ // FIXME: Currently, the set of symbolic strides is sometimes queried before
+ // it's collected. This happens from canVectorizeWithIfConvert, when the
+ // pointer is checked to reference consecutive elements suitable for a
+ // masked access.
+ return LAI ? &LAI->getSymbolicStrides() : nullptr;
+ }
+
+ unsigned NumPredStores;
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
+ /// Applies dynamic knowledge to simplify SCEV expressions in the context
+ /// of existing SCEV assumptions. The analysis will also add a minimal set
+ /// of new predicates if this is required to enable vectorization and
+ /// unrolling.
+ PredicatedScalarEvolution &PSE;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
+ /// Target Transform Info
+ const TargetTransformInfo *TTI;
+ /// Dominator Tree.
+ DominatorTree *DT;
+ // LoopAccess analysis.
+ std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
+ // And the loop-accesses info corresponding to this loop. This pointer is
+ // null until canVectorizeMemory sets it up.
+ const LoopAccessInfo *LAI;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo InterleaveInfo;
+
+ // --- vectorization state --- //
+
+ /// Holds the primary induction variable. This is the counter of the
+ /// loop.
+ PHINode *PrimaryInduction;
+ /// Holds the reduction variables.
+ ReductionList Reductions;
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+ /// Holds the phi nodes that are first-order recurrences.
+ RecurrenceSet FirstOrderRecurrences;
+ /// Holds instructions that need to sink past other instructions to handle
+ /// first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter;
+ /// Holds the widest induction type encountered.
+ Type *WidestIndTy;
+
+ /// Allowed outside users. This holds the induction and reduction
+ /// vars which can be accessed from outside the loop.
+ SmallPtrSet<Value *, 4> AllowedExit;
+
+ /// True if we can assume the absence of NaNs.
+ bool HasFunNoNaNAttr;
+
+ /// Vectorization requirements that will go through late-evaluation.
+ LoopVectorizationRequirements *Requirements;
+
+ /// Used to emit an analysis of any legality issues.
+ LoopVectorizeHints *Hints;
+
+ /// While vectorizing these instructions we have to generate a
+ /// call to the appropriate masked intrinsic.
+ SmallPtrSet<const Instruction *, 8> MaskedOp;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
+ const LoopVectorizeHints *Hints)
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization should be avoided up front.
+ Optional<unsigned> computeMaxVF(bool OptForSize);
+
+ /// Information about vectorization costs
+ struct VectorizationFactor {
+ unsigned Width; // Vector width with best cost
+ unsigned Cost; // Cost of the loop with that width
+ };
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
+ VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+ /// Set up cost-based decisions for the user vectorization factor.
+ void selectUserVectorizationFactor(unsigned UserVF) {
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+ /// \return The desired interleave count.
+ /// If the interleave count has been specified by metadata, it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
+ unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
+
+ /// A memory access instruction may be vectorized in more than one way.
+ /// The form of the instruction after vectorization depends on its cost.
+ /// This function makes cost-based decisions for Load/Store instructions
+ /// and collects them in a map. This decision map is used for building
+ /// the lists of loop-uniform and loop-scalar instructions.
+ /// The calculated cost is saved with the widening decision in order to
+ /// avoid redundant calculations.
+ void setCostBasedWideningDecision(unsigned VF);
+
+ /// \brief A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ unsigned LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ unsigned MaxLocalUsers;
+ /// Holds the number of instructions in the loop.
+ unsigned NumInstructions;
+ };
+
+ /// \return Information about the register usage of the loop for the
+ /// given vectorization factors.
+ SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
+ bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
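+ // With no vectorization (VF == 1) every instruction is trivially uniform.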
+ if (VF == 1)
+ return true;
+ assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+ auto UniformsPerVF = Uniforms.find(VF);
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
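+ // With no vectorization (VF == 1) every instruction remains scalar.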
+ if (VF == 1)
+ return true;
+ assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+ auto ScalarsPerVF = Scalars.find(VF);
+ return ScalarsPerVF->second.count(I);
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
+ bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
+ return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// The decision that was taken during cost calculation for a memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen,
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+ unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+ InstWidening W, unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ unsigned getWideningCost(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
+ bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionVariable(Op);
+ }
+
+private:
+ /// \return An upper bound for the vectorization factor, larger than zero.
+ /// One is returned if vectorization should best be avoided due to cost.
+ unsigned computeFeasibleMaxVF(bool OptForSize);
+
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
+ typedef std::pair<unsigned, bool> VectorizationCostTy;
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ VectorizationCostTy expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter.
+ unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+
+ /// Calculate vectorization cost of memory instruction \p I.
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for scalarized memory instruction.
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for interleaving group of memory instructions.
+ unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for Gather/Scatter instruction.
+ unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+ /// The cost calculation for Load instruction \p I with uniform pointer -
+ /// scalar load + broadcast.
+ unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
+ /// Create an analysis remark that explains why vectorization failed
+ ///
+ /// \p RemarkName is the identifier for the remark. \return the remark object
+ /// that can be streamed to.
+ OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
+ return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+ RemarkName, TheLoop);
+ }
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction *, uint64_t> MinBWs;
+
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
+ typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+ /// A set containing all BasicBlocks that are known to be present after
+ /// vectorization as predicated blocks.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+ DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ unsigned VF);
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(unsigned VF);
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
+ typedef DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>
+ DecisionList;
+
+ DecisionList WideningDecisions;
+
+public:
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+ /// Loop Info analysis.
+ LoopInfo *LI;
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+ /// Assumption cache.
+ AssumptionCache *AC;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ const Function *TheFunction;
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+};
+
+/// LoopVectorizationPlanner - drives the vectorization process after having
+/// passed Legality checks.
+class LoopVectorizationPlanner {
+public:
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
+
+ ~LoopVectorizationPlanner() {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+ unsigned UserVF);
+
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+};
+
+/// \brief This holds vectorization requirements that must be verified late in
+/// the process. The requirements are set by legalize and costmodel. Once
+/// vectorization has been determined to be possible and profitable the
+/// requirements can be verified by looking for metadata or compiler options.
+/// For example, some loops require FP commutativity which is only allowed if
+/// vectorization is explicitly specified or if the fast-math compiler option
+/// has been provided.
+/// Late evaluation of these requirements allows helpful diagnostics to be
+/// composed that tell the user what needs to be done to vectorize the loop,
+/// for example, by specifying #pragma clang loop vectorize or -ffast-math.
+/// Late evaluation should be used only when diagnostics can be generated that
+/// can be followed by a non-expert user.
+class LoopVectorizationRequirements {
+public:
+ LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
+ : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
+
+ void addUnsafeAlgebraInst(Instruction *I) {
+ // First unsafe algebra instruction.
+ if (!UnsafeAlgebraInst)
+ UnsafeAlgebraInst = I;
+ }
+
+ void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
+
+ bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit(
+ OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
+ UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations");
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations");
+ DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+ }
+
+private:
+ unsigned NumRuntimePointerChecks;
+ Instruction *UnsafeAlgebraInst;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE;
+};
+
+static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+ if (L.empty()) {
+ if (!hasCyclesInLoopBody(L))
+ V.push_back(&L);
+ return;
+ }
+ for (Loop *InnerL : L)
+ addAcyclicInnerLoop(*InnerL, V);
+}
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public FunctionPass {
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+ : FunctionPass(ID) {
+ Impl.DisableUnrolling = NoUnrolling;
+ Impl.AlwaysVectorize = AlwaysVectorize;
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ LoopVectorizePass Impl;
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
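+ // Callback used to compute the LoopAccessInfo for a given loop on demand.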
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
+ GetLAA, *ORE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
+//===----------------------------------------------------------------------===//
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // We need to place the broadcast of invariant variables outside the loop.
+ Instruction *Instr = dyn_cast<Instruction>(V);
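+ // A value already defined inside the new vector loop body is not treated
+ // as invariant, so its broadcast is not hoisted into the preheader.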
+ bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+ bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (Invariant)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+}
+
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
+ Value *Start = II.getStartValue();
+
+ // Construct the initial value of the vector IV in the vector loop preheader.
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ }
+
+ // Move the last step to the end of the latch block. This ensures consistent
+ // placement of all induction updates.
+ auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
+ auto *ICmp = cast<Instruction>(Br->getCondition());
+ LastInduction->moveBefore(ICmp);
+ LastInduction->setName("vec.ind.next");
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorLatch);
+}
+
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Cost->isScalarAfterVectorization(I, VF) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+ if (shouldScalarizeInstruction(IV))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
+ };
+ return any_of(IV->users(), isScalarInst);
+}
+
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
+
+ auto II = Legal->getInductionVars()->find(IV);
+ assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
+
+ auto ID = II->second;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+
+ // The scalar value to broadcast. This will be derived from the canonical
+ // induction variable.
+ Value *ScalarIV = nullptr;
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // True if we have vectorized the induction variable.
+ auto VectorizedIV = false;
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant.
+ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+ "Induction step should be loop invariant");
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ Value *Step = nullptr;
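+ // Expand the step with SCEVExpander when the IV type is SCEVable;
+ // otherwise the step is already available as a loop-invariant value.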
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ LoopVectorPreHeader->getTerminator());
+ } else {
+ Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+ }
+
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ VectorizedIV = true;
+ }
+
+ // If we haven't yet vectorized the induction variable, or if we will create
+ // a scalar one, we need to define the scalar induction variable and step
+ // values. If we were given a truncation type, truncate the canonical
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
+ if (!VectorizedIV || NeedsScalarIV) {
+ ScalarIV = Induction;
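+ // If IV is not the primary induction, derive its scalar value from the
+ // canonical induction using the induction descriptor.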
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
+ if (Trunc) {
+ auto *TruncType = cast<IntegerType>(Trunc->getType());
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
+ }
+ }
+
+ // If we haven't yet vectorized the induction variable, splat the scalar
+ // induction variable, and build the necessary step vectors.
+ if (!VectorizedIV) {
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart =
+ getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ }
+ }
+
+ // If an induction variable is only used for counting loop iterations or
+ // calculating addresses, it doesn't need to be widened. Create scalar steps
+ // that can be used by instructions we will later scalarize. Note that the
+ // addition of the scalar steps will not increase the number of instructions
+ // in the loop in the common case prior to InstCombine. We will be trading
+ // one vector extract for each scalar step.
+ if (NeedsScalarIV)
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+}
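+
+// A sketch of the values produced above, assuming a primary induction `i`
+// with start 0 and step 1, and VF = 4, UF = 2. The widened induction has two
+// vector parts per vector-loop iteration,
+//   part 0: <i, i+1, i+2, i+3>
+//   part 1: <i+4, i+5, i+6, i+7>
+// and, when NeedsScalarIV is true, buildScalarSteps also emits the per-lane
+// scalars i+0 .. i+7 for instructions that remain scalar.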
+
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // Create and check the types.
+ assert(Val->getType()->isVectorTy() && "Must be a vector");
+ int VLen = Val->getType()->getVectorNumElements();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ if (STy->isIntegerTy()) {
+    // Create a vector of consecutive numbers starting at StartIdx.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+    // FIXME: The newly created binary instructions should contain nsw/nuw
+    // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+  // Create a vector of consecutive numbers starting at StartIdx.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+
+ Value *MulOp = Builder.CreateFMul(Cv, Step);
+ if (isa<Instruction>(MulOp))
+    // We have to check because MulOp may have been folded to a constant.
+ cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+ Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+ return BOp;
+}
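+
+// A sketch of what getStepVector produces in the integer case, assuming
+// VF = 4 (so VLen = 4), StartIdx = 4, and a scalar Step of 2:
+//   Cv   = <4, 5, 6, 7>
+//   Step = <2, 2, 2, 2>           ; splat of the scalar step
+//   Mul  = <8, 10, 12, 14>        ; Cv * Step
+//   Ret  = Val + <8, 10, 12, 14>  ; the final "induction" add
+// The floating-point case is analogous, using FMul and the induction
+// descriptor's FAdd/FSub opcode for the final operation.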
+
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+ Value *EntryVal,
+ const InductionDescriptor &ID) {
+
+ // We shouldn't have to build scalar steps if we aren't vectorizing.
+ assert(VF > 1 && "VF should be greater than one");
+
+ // Get the value type and ensure it and the step have the same integer type.
+ Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform, we only need to generate the first
+ // lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
+
+ // Compute the scalar steps and save the results in VectorLoopValueMap.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
+ VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
+ }
+ }
+}
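+
+// A sketch of the scalar steps produced above, assuming VF = 4, UF = 2, and
+// an integer step of 1. For a non-uniform EntryVal, all VF lanes of each
+// unroll part are generated:
+//   part 0: ScalarIV + 0, ScalarIV + 1, ScalarIV + 2, ScalarIV + 3
+//   part 1: ScalarIV + 4, ScalarIV + 5, ScalarIV + 6, ScalarIV + 7
+// For a uniform EntryVal, only lane zero of each part is generated:
+//   part 0: ScalarIV + 0
+//   part 1: ScalarIV + 4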
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+
+ const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
+ ValueToValueMap();
+
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ // If we have a stride that is replaced by one, do it here.
+ if (Legal->hasStride(V))
+ V = ConstantInt::get(V->getType(), 1);
+
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
+
+ // If the value has not been vectorized, check if it has been scalarized
+ // instead. If it has been scalarized, and we actually need the value in
+ // vector form, we will construct the vector values on demand.
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
+
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
+
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
+ // If we aren't vectorizing, we can just copy the scalar map values over to
+ // the vector map.
+ if (VF == 1) {
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
+ }
+
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
+ auto *LastInst =
+ cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
+
+ // Set the insert point after the last scalarized instruction. This ensures
+ // the insertelement sequence will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP = std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ } else {
+ VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, getOrCreateScalarValue(V, Part, Lane),
+ Builder.getInt32(Lane));
+ }
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
+ Builder.restoreIP(OldIP);
+ return VectorValue;
+ }
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
+}
+
+Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
+ unsigned Lane) {
+
+ // If the value is not an instruction contained in the loop, it should
+ // already be scalar.
+ if (OrigLoop->isLoopInvariant(V))
+ return V;
+
+ assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
+
+ // If the value from the original loop has not been vectorized, it is
+ // represented by UF x VF scalar values in the new loop. Return the requested
+ // scalar value.
+ if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
+ return VectorLoopValueMap.getScalarValue(V, Part, Lane);
+
+ // If the value has not been scalarized, get its entry in VectorLoopValueMap
+ // for the given unroll part. If this entry is not a vector type (i.e., the
+ // vectorization factor is one), there is no need to generate an
+ // extractelement instruction.
+ auto *U = getOrCreateVectorValue(V, Part);
+ if (!U->getType()->isVectorTy()) {
+ assert(VF == 1 && "Value not scalarized has non-vector type");
+ return U;
+ }
+
+ // Otherwise, the value from the original loop has been vectorized and is
+ // represented by UF vector values. Extract and return the requested scalar
+ // value from the appropriate vector lane.
+ return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
+}
+
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
+ SmallVector<Constant *, 8> ShuffleMask;
+ for (unsigned i = 0; i < VF; ++i)
+ ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+ return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "reverse");
+}
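+
+// For illustration, assuming VF = 4 and i32 elements, the mask built above is
+// <3, 2, 1, 0>, so reverseVector turns <a, b, c, d> into <d, c, b, a>:
+//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
+//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>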
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// R = Pic[i]; // Member of index 0
+// G = Pic[i+1]; // Member of index 1
+// B = Pic[i+2]; // Member of index 2
+// ... // do something to R, G, B
+// }
+// To:
+// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
+// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
+// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
+// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
+//
+// Or translate following interleaved store group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// ... do something to R, G, B
+// Pic[i] = R; // Member of index 0
+// Pic[i+1] = G; // Member of index 1
+// Pic[i+2] = B; // Member of index 2
+// }
+// To:
+// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
+// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
+// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+ const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Skip if current instruction is not the insert position.
+ if (Instr != Group->getInsertPos())
+ return;
+
+ Value *Ptr = getPointerOperand(Instr);
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getMemInstValueType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+ Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
+
+ // Prepare for the new pointers.
+ setDebugLocFromInst(Builder, Ptr);
+ SmallVector<Value *, 2> NewPtrs;
+ unsigned Index = Group->getIndex(Instr);
+
+ // If the group is reverse, adjust the index to refer to the last vector lane
+ // instead of the first. We adjust the index from the first vector lane,
+ // rather than directly getting the pointer for lane VF - 1, because the
+ // pointer operand of the interleaved access is supposed to be uniform. For
+ // uniform instructions, we're only required to generate a value for the
+ // first vector lane in each unroll iteration.
+ if (Group->isReverse())
+ Index += (VF - 1) * Group->getFactor();
+
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
+
+    // Note that the current instruction may be a member at any index in the
+    // group. We need to adjust the address to point to the member at index 0.
+ //
+ // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
+ // b = A[i]; // Member of index 0
+    // The current pointer points to A[i+1]; adjust it to A[i].
+ //
+ // E.g. A[i+1] = a; // Member of index 1
+ // A[i] = b; // Member of index 0
+ // A[i+2] = c; // Member of index 2 (Current instruction)
+    // The current pointer points to A[i+2]; adjust it to A[i].
+ NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+
+ // Cast to the vector pointer type.
+ NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+ }
+
+ setDebugLocFromInst(Builder, Instr);
+ Value *UndefVec = UndefValue::get(VecTy);
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+
+ // For each unroll part, create a wide load for the group.
+ SmallVector<Value *, 2> NewLoads;
+ for (unsigned Part = 0; Part < UF; Part++) {
+ auto *NewLoad = Builder.CreateAlignedLoad(
+ NewPtrs[Part], Group->getAlignment(), "wide.vec");
+ addMetadata(NewLoad, Instr);
+ NewLoads.push_back(NewLoad);
+ }
+
+ // For each member in the group, shuffle out the appropriate data from the
+ // wide loads.
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *StridedVec = Builder.CreateShuffleVector(
+ NewLoads[Part], UndefVec, StrideMask, "strided.vec");
+
+        // If this member has a different type, cast the result to that type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+ }
+
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
+ VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
+ }
+ }
+ return;
+ }
+
+  // The subvector type for the current instruction.
+ VectorType *SubVT = VectorType::get(ScalarTy, VF);
+
+ // Vectorize the interleaved store group.
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+      // An interleaved store group doesn't allow gaps, so each index has a
+      // member.
+ Instruction *Member = Group->getMember(i);
+ assert(Member && "Fail to get a member from an interleaved store group");
+
+ Value *StoredVec = getOrCreateVectorValue(
+ cast<StoreInst>(Member)->getValueOperand(), Part);
+ if (Group->isReverse())
+ StoredVec = reverseVector(StoredVec);
+
+      // If this member has a different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Concatenate all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
+
+ // Interleave the elements in the wide vector.
+ Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
+ Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
+ "interleaved.vec");
+
+ Instruction *NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+ addMetadata(NewStoreInstr, Instr);
+ }
+}
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
+ return vectorizeInterleaveGroup(Instr);
+
+ Type *ScalarDataTy = getMemInstValueType(Instr);
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = getPointerOperand(Instr);
+ unsigned Alignment = getMemInstAlignment(Instr);
+  // An alignment of 0 means target ABI alignment. We need to use the scalar's
+  // target ABI alignment in such a case.
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(ScalarDataTy);
+ unsigned AddressSpace = getMemInstAddressSpace(Instr);
+
+ // Scalarize the memory instruction if necessary.
+ if (Decision == LoopVectorizationCostModel::CM_Scalarize)
+ return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
+
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+ bool CreateGatherScatter =
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
+
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise Decision should have been to Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
+
+ // Handle consecutive loads/stores.
+ if (ConsecutiveStride)
+ Ptr = getOrCreateScalarValue(Ptr, 0, 0);
+
+ VectorParts Mask = createBlockInMask(Instr->getParent());
+ // Handle Stores:
+ if (SI) {
+ assert(!Legal->isUniform(SI->getPointerOperand()) &&
+ "We do not allow storing to uniform addresses");
+ setDebugLocFromInst(Builder, SI);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
+ } else {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+ PartPtr =
+ Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
+ Mask[Part] = reverseVector(Mask[Part]);
+ }
+
+ Value *VecPtr =
+ Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+
+ if (Legal->isMaskRequired(SI))
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ Mask[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ }
+ addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ addMetadata(NewLI, LI);
+ } else {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide load needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
+ Mask[Part] = reverseVector(Mask[Part]);
+ }
+
+ Value *VecPtr =
+ Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ if (Legal->isMaskRequired(LI))
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+ UndefValue::get(DataTy),
+ "wide.masked.load");
+ else
+ NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
+ }
+ VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
+ }
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateInstr) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ DEBUG(dbgs() << "LV: Scalarizing"
+ << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
+ << '\n');
+ // Holds vector parameters or scalars, in case of uniform vals.
+ SmallVector<VectorParts, 4> Params;
+
+ setDebugLocFromInst(Builder, Instr);
+
+  // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ VectorParts Cond;
+ if (IfPredicateInstr)
+ Cond = createBlockInMask(Instr->getParent());
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
+
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+
+ // Start if-block.
+ Value *Cmp = nullptr;
+ if (IfPredicateInstr) {
+ Cmp = Cond[Part];
+ if (Cmp->getType()->isVectorTy())
+ Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
+ ConstantInt::get(Cmp->getType(), 1));
+ }
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // Add the cloned scalar to the scalar map entry.
+ VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
+
+      // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
+ }
+ }
+}
+
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ // As we're just creating this loop, it's possible no latch exists
+ // yet. If so, use the header as this will be a single block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
+ setDebugLocFromInst(Builder, OldInst);
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(Builder, OldInst);
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
+}
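+
+// Shorthand IR for the canonical induction variable created above, as it is
+// used from createVectorizedLoopSkeleton (Start = 0, Step = VF * UF,
+// End = n.vec; the 64-bit index type and the block names are assumptions):
+//   vector.body:
+//     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+//     ...
+//     %index.next = add i64 %index, <VF * UF>
+//     %cmp = icmp eq i64 %index.next, %n.vec
+//     br i1 %cmp, label %middle.block, label %vector.body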
+
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
+
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ "Invalid loop count");
+
+ Type *IdxTy = Legal->getWidestInductionType();
+
+  // The exit count might have type i64 while the phi is i32. This can happen
+  // if we have an induction variable that is sign-extended before the compare.
+  // The only way we can get a backedge-taken count in that case is if the
+  // induction variable was signed, and so it will not overflow. In such a
+  // case, truncation is legal.
+ if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+ // If there is a non-reversed interleaved group that may speculatively access
+ // memory out-of-bounds, we need to ensure that there will be at least one
+ // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
+ // check ensures that N >= Step.
+ if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
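+
+// Worked example of the computation above, with assumed values TC = 100,
+// VF = 4, UF = 2 (so Step = 8):
+//   R     = 100 urem 8 = 4
+//   n.vec = 100 - 4    = 96
+// If a scalar epilogue is required and TC were 96, R would be 0 and is bumped
+// to Step = 8, giving n.vec = 88 and guaranteeing at least one scalar
+// iteration.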
+
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+  // Generate code to check that the loop's trip count, which we computed by
+  // adding one to the backedge-taken count, will not overflow.
+ Value *CheckMinIters = Builder.CreateICmpULT(
+ Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+ BasicBlock *NewBB =
+ BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, CheckMinIters));
+ LoopBypassBlocks.push_back(BB);
+}
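+
+// Shorthand IR for the check above, assuming VF = 4, UF = 2, a 64-bit trip
+// count, and Bypass being the scalar preheader (as in the loop skeleton):
+//   <original preheader, now a bypass block>:
+//     %min.iters.check = icmp ult i64 %trip.count, 8
+//     br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked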
+
+void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *TC = getOrCreateVectorTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
+ Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
+ "cmp.zero");
+
+  // Split the preheader so that, if the vector trip count is zero, we branch
+  // directly to the scalar loop instead of entering the vector loop.
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, Cmp));
+ LoopBypassBlocks.push_back(BB);
+}
+
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+  // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck =
+ Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
+ // Create a new block containing the stride check.
+ BB->setName("vector.scevcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, SCEVCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+  // Generate the code that checks at runtime whether arrays overlap. We put
+  // the checks into a separate block to make the more common case of few
+  // elements faster.
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+ if (!MemRuntimeCheck)
+ return;
+
+ // Create a new block containing the memory check.
+ BB->setName("vector.memcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ // Update dominator tree immediately if the generated block is a
+ // LoopBypassBlock because SCEV expansions to generate loop bypass
+ // checks may query it before the current function is finished.
+ DT->addNewBlock(NewBB, BB);
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
+ LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+ PSE.getSE());
+ LVer->prepareNoAliasMetadata();
+}
+
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- loop iteration number check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(VectorPH && "Invalid loop structure");
+ assert(ExitBlock && "Must have an exit block");
+
+ // Some loops have a single integer induction variable, while other loops
+  // don't. One example is C++ iterators, which often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
+ //
+  // We try as hard as possible to obtain an induction variable from the
+  // original loop. However, if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
+ OldInduction = Legal->getPrimaryInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+
+ // Split the single block loop into the two loop structure described above.
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+ // Create and register the new vector loop.
+ Loop *Lp = new Loop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
+ ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
+ } else {
+ LI->addTopLevelLoop(Lp);
+ }
+ Lp->addBasicBlockToLoop(VecBody, *LI);
+
+ // Find the loop boundaries.
+ Value *Count = getOrCreateTripCount(Lp);
+
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+
+ // We need to test whether the backedge-taken count is uint##_max. Adding one
+ // to it will cause overflow and an incorrect loop trip count in the vector
+ // body. In case of overflow we want to directly jump to the scalar remainder
+ // loop.
+ emitMinimumIterationCountCheck(Lp, ScalarPH);
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
+ emitVectorLoopEnteredCheck(Lp, ScalarPH);
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, ScalarPH);
+
+  // Generate the code that checks at runtime whether arrays overlap. We put
+  // the checks into a separate block to make the more common case of few
+  // elements faster.
+ emitMemRuntimeChecks(Lp, ScalarPH);
+
+ // Generate the induction variable.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+
+ // This variable saves the new starting index for the scalar loop. It is used
+ // to test if there are any tail iterations left once the vector loop has
+ // completed.
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (auto &InductionEntry : *List) {
+ PHINode *OrigPhi = InductionEntry.first;
+ InductionDescriptor II = InductionEntry.second;
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(
+ OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ Value *&EndValue = IVEndValues[OrigPhi];
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
+ EndValue = CountRoundDown;
+ } else {
+ IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+ Type *StepType = II.getStep()->getType();
+ Instruction::CastOps CastOp =
+ CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+ Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ EndValue = II.transform(B, CRD, PSE.getSE(), DL);
+ EndValue->setName("ind.end");
+ }
+
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, MiddleBlock);
+
+ // Fix the scalar body counter (PHI node).
+ unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : LoopBypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
+ OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+ }
+
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ Value *CmpN =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+ ReplaceInstWithInst(MiddleBlock->getTerminator(),
+ BranchInst::Create(ExitBlock, ScalarPH, CmpN));
+
+ // Get ready to start creating new instructions into the vectorized body.
+ Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
+
+ // Save the state.
+ LoopVectorPreHeader = Lp->getLoopPreheader();
+ LoopScalarPreHeader = ScalarPH;
+ LoopMiddleBlock = MiddleBlock;
+ LoopExitBlock = ExitBlock;
+ LoopVectorBody = VecBody;
+ LoopScalarBody = OldBasicBlock;
+
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ Lp->setLoopID(LID);
+
+ LoopVectorizeHints Hints(Lp, true, *ORE);
+ Hints.setAlreadyVectorized();
+}
+
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they, obviously, have different values.
+
+ assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+
+ DenseMap<Value *, Value *> MissingVals;
+
+ // An external user of the last iteration's value should see the value that
+ // the remainder loop uses to initialize its own IV.
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ for (User *U : PostInc->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ MissingVals[UI] = EndValue;
+ }
+ }
+
+  // An external user of the penultimate value needs to see EndValue - Step.
+  // The simplest way to get this is to recompute it from the constituent
+  // SCEVs, that is Start + (Step * (CRD - 1)).
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ const DataLayout &DL =
+ OrigLoop->getHeader()->getModule()->getDataLayout();
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ IRBuilder<> B(MiddleBlock->getTerminator());
+ Value *CountMinusOne = B.CreateSub(
+ CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ Value *CMO =
+ !II.getStep()->getType()->isIntegerTy()
+ ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
+ II.getStep()->getType())
+ : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
+ CMO->setName("cast.cmo");
+ Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
+ Escape->setName("ind.escape");
+ MissingVals[UI] = Escape;
+ }
+ }
+
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each other,
+    // that is, %IV2 = phi [...], [ %IV1, %latch ]
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the middle block.
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ PHI->addIncoming(I.second, MiddleBlock);
+ }
+}
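+
+// Worked example of the values computed above, with assumed values for a
+// non-primary induction: Start = 0, Step = 2, and CountRoundDown (CRD) = 96.
+// An external user of the phi itself (the penultimate value) sees
+//   Start + Step * (CRD - 1) = 0 + 2 * 95 = 190,
+// while an external user of the value fed back from the latch sees EndValue,
+// here Start + Step * CRD = 192, which the remainder loop also starts from.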
+
+namespace {
+struct CSEDenseMapInfo {
+ static bool canHandle(const Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(const Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+}
+
+/// \brief Perform CSE of induction variable instructions.
+static void cse(BasicBlock *BB) {
+  // Perform simple CSE.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
+
+/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// convenience wrapper for the type-based getScalarizationOverhead API.
+static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
+ const TargetTransformInfo &TTI) {
+ if (VF == 1)
+ return 0;
+
+ unsigned Cost = 0;
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ SmallVector<const Value *, 4> Operands(CI->arg_operands());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+  } else if (!isa<StoreInst>(I) ||
+             !TTI.supportsEfficientVectorElementLoadStore()) {
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
+
+ return Cost;
+}
+
+// Estimate cost of a call instruction CI if it were vectorized with factor VF.
+// Return the cost of the instruction, including scalarization overhead if it's
+// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
+// i.e., either a vector version isn't available or it is too expensive.
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI,
+ bool &NeedToScalarize) {
+ Function *F = CI->getCalledFunction();
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ if (VF == 1)
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+
+ unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
+ if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
+ return VectorCallCost;
+ }
+ return Cost;
+}
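+
+// Worked example of the cost comparison above, using purely hypothetical
+// numbers: with VF = 4, ScalarCallCost = 10, and ScalarizationCost = 6, the
+// scalarized cost is 10 * 4 + 6 = 46. If the target library provides a
+// vectorized version of the function whose VectorCallCost is 20, that cost
+// is returned and NeedToScalarize is set to false; otherwise 46 is returned
+// with NeedToScalarize left true.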
+
+// Estimate cost of an intrinsic call instruction CI if it were vectorized with
+// factor VF. Return the cost of the instruction, including scalarization
+// overhead if it's needed.
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+}
+
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(T1->getVectorElementType());
+ auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
+ //
+ SmallPtrSet<Value *, 4> Erased;
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy =
+ IntegerType::get(OriginalTy->getContext(), KV.second);
+ Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
+ OriginalTy->getVectorNumElements());
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value * {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to NewI.
+ cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
+ } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
+ NewI =
+ B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ }
+ } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ auto *O1 = B.CreateZExtOrTrunc(
+ SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ } else if (isa<LoadInst>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
+ NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 = B.CreateZExtOrTrunc(
+ EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ NewI = B.CreateExtractElement(O0, EE->getOperand(2));
+ } else {
+ llvm_unreachable("Unhandled instruction type!");
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ Erased.insert(I);
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
+ }
+ }
+
+  // We'll have created a bunch of ZExts that are now dead. Clean them up.
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
+ }
+ }
+ }
+}
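+
+// A sketch of the shrinking performed above, assuming VF = 4 and an add whose
+// result is known (via MinBWs) to need only 8 bits:
+//   %a32 = zext <4 x i8> %x to <4 x i32>
+//   %b32 = zext <4 x i8> %y to <4 x i32>
+//   %add = add <4 x i32> %a32, %b32
+// becomes
+//   %add.trunc = add <4 x i8> %x, %y
+//   %add.ext   = zext <4 x i8> %add.trunc to <4 x i32>
+// and InstCombine later removes any remaining redundant ext/trunc pairs.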
+
+void InnerLoopVectorizer::fixVectorizedLoop() {
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
+ if (VF > 1)
+ truncateToMinimalBitwidths();
+
+ // At this point every instruction in the original loop is widened to a
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
+ // nodes are currently empty because we did not want to introduce cycles.
+ // This is the second stage of vectorizing recurrences.
+ fixCrossIterationPHIs();
+
+ // Update the dominator tree.
+ //
+ // FIXME: After creating the structure of the new loop, the dominator tree is
+ // no longer up-to-date, and it remains that way until we update it
+ // here. An out-of-date dominator tree is problematic for SCEV,
+ // because SCEVExpander uses it to guide code generation. The
+  //        vectorizer uses SCEVExpanders in several places. Instead, we should
+ // keep the dominator tree up-to-date as we go.
+ updateAnalysis();
+
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : *Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
+ predicateInstructions();
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (Instruction &I : *OrigLoop->getHeader()) {
+ PHINode *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi)
+ break;
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(Phi))
+ fixFirstOrderRecurrence(Phi);
+ else if (Legal->isReductionVariable(Phi))
+ fixReduction(Phi);
+ }
+}
+
+void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+  // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // temporary value for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+  // After the vector loop completes execution, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop.
+
+ // Get the original loop preheader and single loop latch.
+ auto *Preheader = OrigLoop->getLoopPreheader();
+ auto *Latch = OrigLoop->getLoopLatch();
+
+ // Get the initial and previous values of the scalar recurrence.
+ auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
+ auto *Previous = Phi->getIncomingValueForBlock(Latch);
+
+ // Create a vector from the initial value.
+ auto *VectorInit = ScalarInit;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ VectorInit = Builder.CreateInsertElement(
+ UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
+ Builder.getInt32(VF - 1), "vector.recur.init");
+ }
+
+ // We constructed a temporary phi node in the first phase of vectorization.
+ // This phi node will eventually be deleted.
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
+
+ // Create a phi node for the new recurrence. The current value will either be
+ // the initial value inserted into a vector or loop-varying vector value.
+ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
+ VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
+
+ // Get the vectorized previous value of the last part UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
+
+ // Set the insertion point after the previous value if it is an instruction.
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
+ isa<PHINode>(PreviousLastPart))
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
+
+ // We will construct a vector for the recurrence by combining the values for
+ // the current and previous iterations. This is the required shuffle mask.
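+ // For example, for VF = 4 the mask is <3, 4, 5, 6>: the last element of the
+ // previous vector followed by the first VF - 1 elements of the current one.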
+ SmallVector<Constant *, 8> ShuffleMask(VF);
+ ShuffleMask[0] = Builder.getInt32(VF - 1);
+ for (unsigned I = 1; I < VF; ++I)
+ ShuffleMask[I] = Builder.getInt32(I + VF - 1);
+
+ // The vector from which to take the initial value for the current iteration
+ // (actual or unrolled). Initially, this is the vector phi node.
+ Value *Incoming = VecPhi;
+
+ // Shuffle the current and previous vector and update the vector parts.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
+ auto *Shuffle =
+ VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ConstantVector::get(ShuffleMask))
+ : Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
+ }
+
+ // Fix the latch value of the new recurrence in the vector loop.
+ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+
+ // Extract the last vector element in the middle block. This will be the
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *ExtractForScalar = Incoming;
+ if (VF > 1) {
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second-to-last element in the middle block if the Phi is used
+ // outside the loop. We need to extract the phi itself and not the last
+ // element (the phi update in the current iteration). This will be the value
+ // used when jumping from the LoopMiddleBlock to the exit block when the
+ // scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When the loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
+ // `Incoming`. This is analogous to the vectorized case above: extracting the
+ // second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+
+ // Fix the initial value of the original recurrence in the scalar loop.
+ Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ for (auto *BB : predecessors(LoopScalarPreHeader)) {
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
+ Start->addIncoming(Incoming, BB);
+ }
+
+ Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
+ Phi->setName("scalar.recur");
+
+ // Finally, fix users of the recurrence outside the loop. The users will need
+ // either the last value of the scalar recurrence or the last value of the
+ // vector recurrence we extracted in the middle block. Since the loop is in
+ // LCSSA form, we just need to find the phi node for the original scalar
+ // recurrence in the exit block, and then add an edge for the middle block.
+ for (auto &I : *LoopExitBlock) {
+ auto *LCSSAPhi = dyn_cast<PHINode>(&I);
+ if (!LCSSAPhi)
+ break;
+ if (LCSSAPhi->getIncomingValue(0) == Phi) {
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ break;
+ }
+ }
+}
+
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ Constant *Zero = Builder.getInt32(0);
+
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+
+ // Find the reduction identity value: zero for addition, or, and xor;
+ // one for multiplication; -1 for and.
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
+ }
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
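+ // Rewrite all users of the wide exit value, other than the truncation we
+ // just created, to use the truncated-then-extended value instead.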
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ setDebugLocFromInst(Builder, ReducedPartRdx);
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
+ ReducedPartRdx, "bin.rdx"));
+ else
+ ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+ Builder, MinMaxKind, ReducedPartRdx, RdxPart);
+ }
+
+ if (VF > 1) {
+ bool NoNaN = Legal->hasFunNoNaNAttr();
+ ReducedPartRdx =
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end();
+ LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi)
+ break;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found a reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ } // end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (Instruction &LEI : *LoopExitBlock) {
+ auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
+ if (!LCSSAPhi)
+ break;
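+ // An LCSSA phi with a single incoming value carries a loop-invariant value;
+ // reuse that value for the edge from the middle block so the phi stays
+ // well formed now that the middle block can also branch to the exit.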
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
+ }
+}
+
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+
+ // The basic block and loop containing the predicated instruction.
+ auto *PredBB = PredInst->getParent();
+ auto *VectorLoop = LI->getLoopFor(PredBB);
+
+ // Initialize a worklist with the operands of the predicated instruction.
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+ // Holds instructions that we need to analyze again. An instruction may be
+ // reanalyzed if we don't yet know if we can sink it or not.
+ SmallVector<Instruction *, 8> InstsToReanalyze;
+
+ // Returns true if a given use occurs in the predicated block. Phi nodes use
+ // their operands in their corresponding predecessor blocks.
+ auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+ auto *I = cast<Instruction>(U.getUser());
+ BasicBlock *BB = I->getParent();
+ if (auto *Phi = dyn_cast<PHINode>(I))
+ BB = Phi->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ return BB == PredBB;
+ };
+
+ // Iteratively sink the scalarized operands of the predicated instruction
+ // into the block we created for it. When an instruction is sunk, its
+ // operands are then added to the worklist. The algorithm ends when a pass
+ // through the worklist fails to sink a single instruction.
+ bool Changed;
+ do {
+
+ // Add the instructions that need to be reanalyzed to the worklist, and
+ // reset the changed indicator.
+ Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+ InstsToReanalyze.clear();
+ Changed = false;
+
+ while (!Worklist.empty()) {
+ auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+ // We can't sink an instruction if it is a phi node, is already in the
+ // predicated block, is not in the loop, or may have side effects.
+ if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+ !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ continue;
+
+ // It's legal to sink the instruction if all its uses occur in the
+ // predicated block. Otherwise, there's nothing to do yet, and we may
+ // need to reanalyze the instruction.
+ if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+ InstsToReanalyze.push_back(I);
+ continue;
+ }
+
+ // Move the instruction to the beginning of the predicated block, and add
+ // its operands to the worklist.
+ I->moveBefore(&*PredBB->getFirstInsertionPt());
+ Worklist.insert(I->op_begin(), I->op_end());
+
+ // The sinking may have enabled other instructions to be sunk, so we will
+ // need to iterate.
+ Changed = true;
+ }
+ } while (Changed);
+}
+
+void InnerLoopVectorizer::predicateInstructions() {
+
+ // For each instruction I marked for predication on value C, split I into its
+ // own basic block to form an if-then construct over C. Since I may be fed by
+ // an extractelement instruction or other scalar operand, we try to
+ // iteratively sink its scalar operands into the predicated block. If I feeds
+ // an insertelement instruction, we try to move this instruction into the
+ // predicated block as well. For non-void types, a phi node will be created
+ // for the resulting value (either vector or scalar).
+ //
+ // So for some predicated instruction, e.g. the conditional sdiv in:
+ //
+ // for.body:
+ // ...
+ // %add = add nsw i32 %mul, %0
+ // %cmp5 = icmp sgt i32 %2, 7
+ // br i1 %cmp5, label %if.then, label %if.end
+ //
+ // if.then:
+ // %div = sdiv i32 %0, %1
+ // br label %if.end
+ //
+ // if.end:
+ // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
+ //
+ // the sdiv at this point is scalarized and if-converted using a select.
+ // The inactive elements in the vector are not used, but the predicated
+ // instruction is still executed for all vector elements, essentially:
+ //
+ // vector.body:
+ // ...
+ // %17 = add nsw <2 x i32> %16, %wide.load
+ // %29 = extractelement <2 x i32> %wide.load, i32 0
+ // %30 = extractelement <2 x i32> %wide.load51, i32 0
+ // %31 = sdiv i32 %29, %30
+ // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
+ // %35 = extractelement <2 x i32> %wide.load, i32 1
+ // %36 = extractelement <2 x i32> %wide.load51, i32 1
+ // %37 = sdiv i32 %35, %36
+ // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
+ // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
+ //
+ // Predication will now re-introduce the original control flow to avoid false
+ // side-effects by the sdiv instructions on the inactive elements, yielding
+ // (after cleanup):
+ //
+ // vector.body:
+ // ...
+ // %5 = add nsw <2 x i32> %4, %wide.load
+ // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
+ // %9 = extractelement <2 x i1> %8, i32 0
+ // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
+ //
+ // pred.sdiv.if:
+ // %10 = extractelement <2 x i32> %wide.load, i32 0
+ // %11 = extractelement <2 x i32> %wide.load51, i32 0
+ // %12 = sdiv i32 %10, %11
+ // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
+ // br label %pred.sdiv.continue
+ //
+ // pred.sdiv.continue:
+ // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
+ // %15 = extractelement <2 x i1> %8, i32 1
+ // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
+ //
+ // pred.sdiv.if54:
+ // %16 = extractelement <2 x i32> %wide.load, i32 1
+ // %17 = extractelement <2 x i32> %wide.load51, i32 1
+ // %18 = sdiv i32 %16, %17
+ // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
+ // br label %pred.sdiv.continue55
+ //
+ // pred.sdiv.continue55:
+ // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
+ // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
+
+ for (auto KV : PredicatedInstructions) {
+ BasicBlock::iterator I(KV.first);
+ BasicBlock *Head = I->getParent();
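+ // Form an if-then over the predicate KV.second: SplitBlockAndInsertIfThen
+ // creates a new 'then' block whose terminator is T. We then move I into that
+ // block and try to sink its scalar operands after it.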
+ auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DT, LI);
+ I->moveBefore(T);
+ sinkScalarOperands(&*I);
+
+ BasicBlock *PredicatedBlock = I->getParent();
+ Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
+ PredicatedBlock->setName(BBNamePrefix + ".if");
+ PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
+
+ // If the instruction is non-void, create a Phi node at the reconvergence point.
+ if (!I->getType()->isVoidTy()) {
+ Value *IncomingTrue = nullptr;
+ Value *IncomingFalse = nullptr;
+
+ if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
+ // If the predicated instruction is feeding an insert-element, move it
+ // into the Then block; Phi node will be created for the vector.
+ InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
+ IEI->moveBefore(T);
+ IncomingTrue = IEI; // the new vector with the inserted element.
+ IncomingFalse = IEI->getOperand(0); // the unmodified vector
+ } else {
+ // Phi node will be created for the scalar predicated instruction.
+ IncomingTrue = &*I;
+ IncomingFalse = UndefValue::get(I->getType());
+ }
+
+ BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
+ assert(PostDom && "Then block has multiple successors");
+ PHINode *Phi =
+ PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
+ IncomingTrue->replaceAllUsesWith(Phi);
+ Phi->addIncoming(IncomingFalse, Head);
+ Phi->addIncoming(IncomingTrue, I->getParent());
+ }
+ }
+
+ DEBUG(DT->verifyDomTree());
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VectorParts SrcMask = createBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (BI->isConditional()) {
+
+ VectorParts EdgeMask(UF);
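+ // The mask of an edge is the source block's mask ANDed with the branch
+ // condition, negated when this edge leads to the false successor.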
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
+
+ EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
+ EdgeMask[Part] = EdgeMaskPart;
+ }
+
+ EdgeMaskCache[Edge] = EdgeMask;
+ return EdgeMask;
+ }
+
+ EdgeMaskCache[Edge] = SrcMask;
+ return SrcMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ VectorParts BlockMask(UF);
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB) {
+ Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(C, Part);
+ BlockMaskCache[BB] = BlockMask;
+ return BlockMask;
+ }
+
+ // This is the block mask. We OR all of the incoming edge masks, starting
+ // from an all-zero mask.
+ Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
+
+ // For each pred:
+ for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
+ VectorParts EM = createEdgeMask(*It, BB);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
+ }
+
+ BlockMaskCache[BB] = BlockMask;
+ return BlockMask;
+}
+
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
+ unsigned VF) {
+ PHINode *P = cast<PHINode>(PN);
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
+ if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // This is phase one of vectorizing PHIs.
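+ // The incoming edges are filled in later by fixCrossIterationPHIs(), which
+ // may also replace this temporary phi for first-order recurrences.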
+ Type *VecTy =
+ (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ Value *EntryPart = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ }
+ return;
+ }
+
+ setDebugLocFromInst(Builder, P);
+ // Check for PHI nodes that are lowered to vector selects.
+ if (P->getParent() != OrigLoop->getHeader()) {
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = P->getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ VectorParts Entry(UF);
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VectorParts Cond =
+ createEdgeMask(P->getIncomingBlock(In), P->getParent());
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ if (In == 0)
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
+ "predphi");
+ }
+ }
+ for (unsigned Part = 0; Part < UF; ++Part)
+ VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
+ return;
+ }
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
+
+ InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction:
+ case InductionDescriptor::IK_FpInduction:
+ return widenIntOrFpInduction(P);
+ case InductionDescriptor::IK_PtrInduction: {
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = Induction;
+ PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
+ // These are the scalar results. Notice that we don't generate vector GEPs
+ // because scalar GEPs result in better code.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
+ SclrGep->setName("next.gep");
+ VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
+ }
+ }
+ return;
+ }
+ }
+}
+
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Non-zero divisors that are not compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so without predication.
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
+}
+
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+ // Scalarize instructions that should remain scalar after vectorization.
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+ return;
+ }
+
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ break;
+ case Instruction::PHI: {
+ // Vectorize PHINodes.
+ widenPHIInstruction(&I, UF, VF);
+ break;
+ } // End of PHI.
+ case Instruction::GetElementPtr: {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+ auto *GEP = cast<GetElementPtrInst>(&I);
+
+ if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr =
+ OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+ if (OrigLoop->isLoopInvariant(U.get()))
+ Indices.push_back(U.get());
+ else
+ Indices.push_back(getOrCreateVectorValue(U.get(), Part));
+ }
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(Ptr, Indices)
+ : Builder.CreateGEP(Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
+ addMetadata(NewGEP, GEP);
+ }
+ }
+
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // block execution is conditional, otherwise fallthrough.
+ if (Legal->isScalarWithPredication(&I)) {
+ scalarizeInstruction(&I, true);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ auto *BinOp = cast<BinaryOperator>(&I);
+ setDebugLocFromInst(Builder, BinOp);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+
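+ // Copy wrapping, exactness, and fast-math flags from the scalar operation
+ // to the widened operation.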
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
+
+ // Use this vector value for all users of the original instruction.
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, BinOp);
+ }
+
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
+ Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
+ Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
+ Value *Sel =
+ Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
+ VectorLoopValueMap.setVectorValue(&I, Part, C);
+ addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(&I);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = dyn_cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+ if (Cost->isOptimizableIVTruncate(CI, VF)) {
+ widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+ cast<TruncInst>(CI));
+ break;
+ }
+
+ // Vectorize casts.
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ addMetadata(Cast, &I);
+ }
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ break;
+ setDebugLocFromInst(Builder, &I);
+
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
+
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&I);
+ break;
+ }
+ // The flag shows whether we use an intrinsic or a regular call for the
+ // vectorized version of the instruction.
+ // Is it more beneficial to perform the intrinsic call than the lib call?
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(&I);
+ break;
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+ Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
+ Args.push_back(Arg);
+ }
+
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
+ }
+ }
+ assert(VectorF && "Can't create vector function.");
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
+
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, &I);
+ }
+
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(&I);
+ break;
+ } // end of switch.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
+
+ // Update the dominator tree information.
+ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+ "Entry does not dominate exit.");
+
+ DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
+ LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock,
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
+ DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+ DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
+
+ DEBUG(DT->verifyDomTree());
+}
+
+/// \brief Check whether it is safe to if-convert the phi nodes in this block.
+///
+/// Phi nodes with constant expressions that can trap are not safe to
+/// if-convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (Instruction &I : *BB) {
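+ // PHIs are grouped at the start of the block; reaching a non-PHI means
+ // every PHI has been checked and none of them can trap.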
+ auto *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi)
+ return true;
+ for (Value *V : Phi->incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ ORE->emit(createMissedAnalysis("IfConversionDisabled")
+ << "if-conversion is disabled");
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers that we can safely read and write to.
+ SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
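+ // Pointers accessed in blocks that execute on every iteration are accessed
+ // unconditionally anyway, so speculating loads and stores to them in
+ // predicated blocks cannot introduce new traps.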
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (blockNeedsPredication(BB))
+ continue;
+
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getPointerOperand(&I))
+ SafePointers.insert(Ptr);
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
+ << "loop contains a switch statement");
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+ if (!blockCanBePredicated(BB, SafePointers)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!TheLoop->getLoopPreheader()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // FIXME: The code is currently dead, since the loop that gets sent to
+ // LoopVectorizationLegality is already an innermost loop.
+ //
+ // We can only vectorize innermost loops.
+ if (!TheLoop->empty()) {
+ ORE->emit(createMissedAnalysis("NotInnermostLoop")
+ << "loop is not the innermost loop");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (TheLoop->getNumBackEdges() != 1) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!TheLoop->getExitingBlock()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // We only handle bottom-tested loops, i.e. loops in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved)
+ InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+ // It is possible that chars or shorts overflow when we ask for the loop's
+ // trip count; work around this by widening the type.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reduction and Induction instructions are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the loop are inside the BB.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() &&
+ ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ DEBUG(dbgs() << "LV: Found an induction variable.\n");
+ return;
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "loop control flow is not understood by vectorizer");
+ DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
+ continue;
+ ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
+ << "value could not be identified as "
+ "an induction or reduction variable");
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
+ << "control flow not understood by vectorizer");
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
+ << "value that could not be identified as "
+ "reduction is used outside the loop");
+ DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+ << "call instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
+ return false;
+ }
+
+ // Intrinsics such as powi, cttz, and ctlz are legal to vectorize if the
+ // second argument is the same (i.e. loop invariant).
+ if (CI && hasVectorInstrinsicScalarOpd(
+ getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+ << "intrinsic instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
+ << "instruction return type cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
+ << "store instruction cannot be vectorized");
+ return false;
+ }
+
+ // FP instructions can allow unsafe algebra, thus vectorizable by
+ // non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.hasUnsafeAlgebra()) {
+ DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
+ << "value cannot be used outside the loop");
+ return false;
+ }
+
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ if (Inductions.empty()) {
+ ORE->emit(createMissedAnalysis("NoInductionVariable")
+ << "loop induction variable could not be identified");
+ return false;
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
+
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
+ assert(VF >= 2 && !Scalars.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getPointerOperand(MemAccess) &&
+ "Ptr is neither a value nor a pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
+
+ // A helper that evaluates a memory access's use of a pointer. If the use
+ // will be a scalar use, and the pointer is only used by memory accesses, we
+ // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
+ // instructions marked uniform-after-vectorization, (2) bitcast and
+ // getelementptr instructions used by memory accesses requiring a scalar use,
+ // and (3) pointer induction variables and their update instructions (we
+ // currently only scalarize these).
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+  // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
+ }
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
+ }
+
+ // (3) Add to the worklist all pointer induction variables and their update
+ // instructions.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we should
+ // no longer insert them into the worklist here.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+ continue;
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ }
+
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ if (ForcedScalars.count(VF))
+ for (auto *I : ForcedScalars.find(VF)->second)
+ Worklist.insert(I);
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // We already considered pointer induction variables, so there's no reason
+ // to look at their users again.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we
+ // should no longer skip over them here.
+ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+ continue;
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ }
+
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
+}
+
+bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Store:
+ return !isMaskRequired(I);
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return mayDivideByZero(*I);
+ }
+ return false;
+}
+
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
+ // Get and ensure we have a valid memory instruction.
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ assert((LI || SI) && "Invalid memory instruction");
+
+ auto *Ptr = getPointerOperand(I);
+
+  // First of all, in order to be widened the pointer must be consecutive.
+ if (!isConsecutivePtr(Ptr))
+ return false;
+
+ // If the instruction is a store located in a predicated block, it will be
+ // scalarized.
+ if (isScalarWithPredication(I))
+ return false;
+
+  // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ if (hasIrregularType(ScalarTy, DL, VF))
+ return false;
+
+ return true;
+}
+
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
+ assert(VF >= 2 && !Uniforms.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+  // Create the entry for this VF up front so that, even if no uniform values
+  // are found, we will not analyze this VF again; Uniforms.count(VF) will
+  // then return 1.
+ Uniforms[VF].clear();
+
+ // We now know that the loop is vectorizable!
+ // Collect instructions inside the loop that will remain uniform after
+ // vectorization.
+
+  // Global values, parameters, and instructions outside of the current loop
+  // are out of scope.
+ auto isOutOfScope = [&](Value *V) -> bool {
+ Instruction *I = dyn_cast<Instruction>(V);
+ return (!I || !TheLoop->contains(I));
+ };
+
+ SetVector<Instruction *> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // Start with the conditional branch. If the branch condition is an
+ // instruction contained in the loop that is only used by the branch, it is
+ // uniform.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
+ Worklist.insert(Cmp);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
+ }
+
+ // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
+ // are pointers that are treated like consecutive pointers during
+ // vectorization. The pointer operands of interleaved accesses are an
+ // example.
+ SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
+
+ // Holds pointer operands of instructions that are possibly non-uniform.
+ SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+
+ auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Interleave);
+ };
+ // Iterate over the instructions in the loop, and collect all
+ // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
+ // that a consecutive-like pointer operand will be scalarized, we collect it
+ // in PossibleNonUniformPtrs instead. We use two sets here because a single
+ // getelementptr instruction can be used by both vectorized and scalarized
+ // memory instructions. For example, if a loop loads and stores from the same
+ // location, but the store is conditional, the store will be scalarized, and
+ // the getelementptr won't remain uniform.
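+  //
+  // For illustration (hypothetical loop): given
+  //   x = A[i]; if (c) A[i] = y;
+  // the load of A[i] may be widened while the conditional store is
+  // scalarized, so the shared getelementptr feeding A[i] is recorded in both
+  // ConsecutiveLikePtrs and PossibleNonUniformPtrs and is therefore not
+  // treated as uniform.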
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+
+ // If there's no pointer operand, there's nothing to do.
+ auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (!Ptr)
+ continue;
+
+ // True if all users of Ptr are memory accesses that have Ptr as their
+ // pointer operand.
+ auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
+ return getPointerOperand(U) == Ptr;
+ });
+
+ // Ensure the memory instruction will not be scalarized or used by
+ // gather/scatter, making its pointer operand non-uniform. If the pointer
+ // operand is used by any instruction other than a memory access, we
+ // conservatively assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
+ PossibleNonUniformPtrs.insert(Ptr);
+
+      // If the memory instruction will be vectorized and its pointer operand
+      // is consecutive-like or part of an interleaved group, the pointer
+      // operand should remain uniform.
+ else
+ ConsecutiveLikePtrs.insert(Ptr);
+ }
+
+ // Add to the Worklist all consecutive and consecutive-like pointers that
+ // aren't also identified as possibly non-uniform.
+ for (auto *V : ConsecutiveLikePtrs)
+ if (!PossibleNonUniformPtrs.count(V)) {
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
+ Worklist.insert(V);
+ }
+
+  // Expand Worklist in topological order: whenever a new instruction
+  // is added, its users should be either already inside Worklist or
+  // out of scope. This ensures a uniform instruction will only be used
+  // by uniform instructions or out-of-scope instructions.
+ unsigned idx = 0;
+ while (idx != Worklist.size()) {
+ Instruction *I = Worklist[idx++];
+
+ for (auto OV : I->operand_values()) {
+ if (isOutOfScope(OV))
+ continue;
+ auto *OI = cast<Instruction>(OV);
+ if (all_of(OI->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ (OI == getPointerOperand(J) && isUniformDecision(J, VF));
+ })) {
+ Worklist.insert(OI);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
+ }
+ }
+ }
+
+ // Returns true if Ptr is the pointer operand of a memory access instruction
+ // I, and I is known to not require scalarization.
+ auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+ return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ };
+
+ // For an instruction to be added into Worklist above, all its users inside
+ // the loop should also be in Worklist. However, this condition cannot be
+ // true for phi nodes that form a cyclic dependence. We must process phi
+ // nodes separately. An induction variable will remain uniform if all users
+ // of the induction variable and induction variable update remain uniform.
+ // The code below handles both pointer and non-pointer induction variables.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are uniform after
+ // vectorization.
+ auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, Ind);
+ });
+ if (!UniformInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // uniform after vectorization.
+ auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
+ if (!UniformIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain uniform.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
+ }
+
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ InterleaveInfo.setLAI(LAI);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ ORE->emit(VR);
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasStoreToLoopInvariantAddress()) {
+ ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
+ << "write to a loop invariant address could not be vectorized");
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
+ BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
+ isLegalMaskedGather(LI->getType())) {
+ MaskedOp.insert(LI);
+ continue;
+ }
+ // !llvm.mem.parallel_loop_access implies if-conversion safety.
+ if (IsAnnotatedParallel)
+ continue;
+ return false;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ // We only support predication of stores in basic blocks with one
+ // predecessor.
+ if (!SI)
+ return false;
+
+ // Build a masked store if it is legal for the target.
+ if (isLegalMaskedStore(SI->getValueOperand()->getType(),
+ SI->getPointerOperand()) ||
+ isLegalMaskedScatter(SI->getValueOperand()->getType())) {
+ MaskedOp.insert(SI);
+ continue;
+ }
+
+ bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
+ bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
+
+ if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
+ !isSinglePredecessor)
+ return false;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
+
+void InterleavedAccessInfo::collectConstStrideAccesses(
+ MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
+ const ValueToValueMap &Strides) {
+
+ auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+
+ // Since it's desired that the load/store instructions be maintained in
+ // "program order" for the interleaved access analysis, we have to visit the
+ // blocks in the loop in reverse postorder (i.e., in a topological order).
+ // Such an ordering will ensure that any load/store that may be executed
+ // before a second load/store will precede the second load/store in
+ // AccessStrideInfo.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (auto &I : *BB) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!LI && !SI)
+ continue;
+
+ Value *Ptr = getPointerOperand(&I);
+ // We don't check wrapping here because we don't know yet if Ptr will be
+ // part of a full group or a group with gaps. Checking wrapping for all
+ // pointers (even those that end up in groups with no gaps) will be overly
+ // conservative. For full groups, wrapping should be ok since if we would
+ // wrap around the address space we would do a memory access at nullptr
+ // even without the transformation. The wrapping checks are therefore
+ // deferred until after we've formed the interleaved groups.
+ int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false);
+
+ const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+
+ // An alignment of 0 means target ABI alignment.
+ unsigned Align = getMemInstAlignment(&I);
+ if (!Align)
+ Align = DL.getABITypeAlignment(PtrTy->getElementType());
+
+ AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
+ }
+}
+
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
+//
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
+//
+// E.g., for the WAR dependence: a = A[i]; // (1)
+// A[i] = b; // (2)
+//
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
+//
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
+//
+// We visit the memory accesses in bottom-up order because it can simplify the
+// construction of store groups in the presence of write-after-write (WAW)
+// dependences.
+//
+// E.g., for the WAW dependence: A[i] = a; // (1)
+// A[i] = b; // (2)
+// A[i + 1] = c; // (3)
+//
+// We will first create a store group with (3) and (2). (1) can't be added to
+// this group because it and (2) are dependent. However, (1) can be grouped
+// with other accesses that may precede it in program order. Note that a
+// bottom-up order does not imply that WAW dependences should not be checked.
+void InterleavedAccessInfo::analyzeInterleaving(
+ const ValueToValueMap &Strides) {
+ DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+
+ // Holds all accesses with a constant stride.
+ MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
+ collectConstStrideAccesses(AccessStrideInfo, Strides);
+
+ if (AccessStrideInfo.empty())
+ return;
+
+ // Collect the dependences in the loop.
+ collectDependences();
+
+ // Holds all interleaved store groups temporarily.
+ SmallSetVector<InterleaveGroup *, 4> StoreGroups;
+ // Holds all interleaved load groups temporarily.
+ SmallSetVector<InterleaveGroup *, 4> LoadGroups;
+
+ // Search in bottom-up program order for pairs of accesses (A and B) that can
+ // form interleaved load or store groups. In the algorithm below, access A
+ // precedes access B in program order. We initialize a group for B in the
+ // outer loop of the algorithm, and then in the inner loop, we attempt to
+ // insert each A into B's group if:
+ //
+ // 1. A and B have the same stride,
+ // 2. A and B have the same memory object size, and
+ // 3. A belongs in B's group according to its distance from B.
+ //
+ // Special care is taken to ensure group formation will not break any
+ // dependences.
+ for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
+ BI != E; ++BI) {
+ Instruction *B = BI->first;
+ StrideDescriptor DesB = BI->second;
+
+ // Initialize a group for B if it has an allowable stride. Even if we don't
+ // create a group for B, we continue with the bottom-up algorithm to ensure
+ // we don't break any of B's dependences.
+ InterleaveGroup *Group = nullptr;
+ if (isStrided(DesB.Stride)) {
+ Group = getInterleaveGroup(B);
+ if (!Group) {
+ DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
+ Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
+ }
+ if (B->mayWriteToMemory())
+ StoreGroups.insert(Group);
+ else
+ LoadGroups.insert(Group);
+ }
+
+ for (auto AI = std::next(BI); AI != E; ++AI) {
+ Instruction *A = AI->first;
+ StrideDescriptor DesA = AI->second;
+
+ // Our code motion strategy implies that we can't have dependences
+ // between accesses in an interleaved group and other accesses located
+ // between the first and last member of the group. Note that this also
+ // means that a group can't have more than one member at a given offset.
+ // The accesses in a group can have dependences with other accesses, but
+ // we must ensure we don't extend the boundaries of the group such that
+ // we encompass those dependent accesses.
+ //
+ // For example, assume we have the sequence of accesses shown below in a
+ // stride-2 loop:
+ //
+ // (1, 2) is a group | A[i] = a; // (1)
+ // | A[i-1] = b; // (2) |
+ // A[i-3] = c; // (3)
+ // A[i] = d; // (4) | (2, 4) is not a group
+ //
+ // Because accesses (2) and (3) are dependent, we can group (2) with (1)
+ // but not with (4). If we did, the dependent access (3) would be within
+ // the boundaries of the (2, 4) group.
+ if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
+
+ // If a dependence exists and A is already in a group, we know that A
+ // must be a store since A precedes B and WAR dependences are allowed.
+ // Thus, A would be sunk below B. We release A's group to prevent this
+ // illegal code motion. A will then be free to form another group with
+ // instructions that precede it.
+ if (isInterleaved(A)) {
+ InterleaveGroup *StoreGroup = getInterleaveGroup(A);
+ StoreGroups.remove(StoreGroup);
+ releaseGroup(StoreGroup);
+ }
+
+ // If a dependence exists and A is not already in a group (or it was
+ // and we just released it), B might be hoisted above A (if B is a
+ // load) or another store might be sunk below A (if B is a store). In
+ // either case, we can't add additional instructions to B's group. B
+ // will only form a group with instructions that it precedes.
+ break;
+ }
+
+ // At this point, we've checked for illegal code motion. If either A or B
+ // isn't strided, there's nothing left to do.
+ if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
+ continue;
+
+ // Ignore A if it's already in a group or isn't the same kind of memory
+ // operation as B.
+ if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ continue;
+
+ // Check rules 1 and 2. Ignore A if its stride or size is different from
+ // that of B.
+ if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
+ continue;
+
+      // Ignore A if the memory objects of A and B don't belong to the same
+      // address space.
+ if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
+ continue;
+
+ // Calculate the distance from A to B.
+ const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
+ if (!DistToB)
+ continue;
+ int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
+
+ // Check rule 3. Ignore A if its distance to B is not a multiple of the
+ // size.
+ if (DistanceToB % static_cast<int64_t>(DesB.Size))
+ continue;
+
+ // Ignore A if either A or B is in a predicated block. Although we
+ // currently prevent group formation for predicated accesses, we may be
+ // able to relax this limitation in the future once we handle more
+ // complicated blocks.
+ if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+ continue;
+
+ // The index of A is the index of B plus A's distance to B in multiples
+ // of the size.
+ int IndexA =
+ Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
+
+ // Try to insert A into B's group.
+ if (Group->insertMember(A, IndexA, DesA.Align)) {
+ DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
+ << " into the interleave group with" << *B << '\n');
+ InterleaveGroupMap[A] = Group;
+
+ // Set the first load in program order as the insert position.
+ if (A->mayReadFromMemory())
+ Group->setInsertPos(A);
+ }
+ } // Iteration over A accesses.
+ } // Iteration over B accesses.
+
+ // Remove interleaved store groups with gaps.
+ for (InterleaveGroup *Group : StoreGroups)
+ if (Group->getNumMembers() != Group->getFactor())
+ releaseGroup(Group);
+
+ // Remove interleaved groups with gaps (currently only loads) whose memory
+ // accesses may wrap around. We have to revisit the getPtrStride analysis,
+ // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
+ // not check wrapping (see documentation there).
+ // FORNOW we use Assume=false;
+ // TODO: Change to Assume=true but making sure we don't exceed the threshold
+ // of runtime SCEV assumptions checks (thereby potentially failing to
+ // vectorize altogether).
+ // Additional optional optimizations:
+ // TODO: If we are peeling the loop and we know that the first pointer doesn't
+ // wrap then we can deduce that all pointers in the group don't wrap.
+ // This means that we can forcefully peel the loop in order to only have to
+  // check the first pointer for no-wrap. When we change to use Assume=true
+  // we'll only need at most one runtime check per interleaved group.
+ //
+ for (InterleaveGroup *Group : LoadGroups) {
+
+    // Case 1: A full group. We can skip the checks: for full groups, if the
+    // wide load would wrap around the address space, we would access memory
+    // at nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
+ continue;
+
+    // Case 2: If the first and last members of the group don't wrap, this
+    // implies that none of the pointers in the group wrap.
+    // So we check only group member 0 (which is always guaranteed to exist)
+    // and group member Factor - 1; if the latter doesn't exist we rely on
+    // peeling (if it is a non-reversed access -- see Case 3).
+ Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
+ if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ /*ShouldCheckWrap=*/true)) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "first group member potentially pointer-wrapping.\n");
+ releaseGroup(Group);
+ continue;
+ }
+ Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
+ if (LastMember) {
+ Value *LastMemberPtr = getPointerOperand(LastMember);
+ if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ /*ShouldCheckWrap=*/true)) {
+ DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ "last group member potentially pointer-wrapping.\n");
+ releaseGroup(Group);
+ }
+ } else {
+ // Case 3: A non-reversed interleaved load group with gaps: We need
+ // to execute at least one scalar epilogue iteration. This will ensure
+ // we don't speculatively access memory out-of-bounds. We only need
+ // to look for a member at index factor - 1, since every group must have
+ // a member at index zero.
+ if (Group->isReverse()) {
+ releaseGroup(Group);
+ continue;
+ }
+ DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ RequiresScalarEpilogue = true;
+ }
+ }
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+ if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return None;
+ }
+
+ if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
+ return computeFeasibleMaxVF(OptForSize);
+
+ if (Legal->getRuntimePointerChecking()->Need) {
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ DEBUG(dbgs()
+ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ // If we don't know the precise trip count, don't try to vectorize.
+ if (TC < 2) {
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
+ << "unable to calculate the loop count due to complex control flow");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+
+ if (TC % MaxVF != 0) {
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
+ // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+ // smaller MaxVF that does not require a scalar epilog.
+
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ return MaxVF;
+}
+
+unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+ unsigned MaxSafeDepDist = -1U;
+
+ // Get the maximum safe dependence distance in bits computed by LAA. If the
+ // loop contains any interleaved accesses, we divide the dependence distance
+ // by the maximum interleave factor of all interleaved groups. Note that
+ // although the division ensures correctness, this is a fairly conservative
+ // computation because the maximum distance computed by LAA may not involve
+ // any of the interleaved accesses.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist =
+ Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
+
+ WidestRegister =
+ ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
+ unsigned MaxVectorSize = WidestRegister / WidestType;
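+  // For example (hypothetical target): a 256-bit widest register and a widest
+  // element type of 32 bits yield MaxVectorSize = 256 / 32 = 8.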
+
+ DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
+ << WidestType << " bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
+ << " bits.\n");
+
+ if (MaxVectorSize == 0) {
+ DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
+ }
+
+ assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
+ " into one vector!");
+
+ unsigned MaxVF = MaxVectorSize;
+ if (MaximizeBandwidth && !OptForSize) {
+ // Collect all viable vectorization factors.
+ SmallVector<unsigned, 8> VFs;
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ VFs.push_back(VS);
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ MaxVF = VFs[i];
+ break;
+ }
+ }
+ }
+ return MaxVF;
+}
+
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
+ float Cost = expectedCost(1).first;
+#ifndef NDEBUG
+ const float ScalarCost = Cost;
+#endif /* NDEBUG */
+ unsigned Width = 1;
+ DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ if (ForceVectorization && MaxVF > 1) {
+ Width = 2;
+ Cost = expectedCost(Width).first / (float)Width;
+ }
+
+ for (unsigned i = 2; i <= MaxVF; i *= 2) {
+    // Notice that the vector loop needs to be executed fewer times, so
+    // we need to divide the cost of the vector loop by the width of
+    // the vector elements.
+ VectorizationCostTy C = expectedCost(i);
+ float VectorCost = C.first / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
+ if (!C.second && !ForceVectorization) {
+ DEBUG(
+ dbgs() << "LV: Not considering vector loop of width " << i
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
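+  // For example (hypothetical costs): if the scalar loop costs 8 and the
+  // VF=4 loop costs 12 in total, its per-lane cost is 12 / 4 = 3, which beats
+  // the scalar cost of 8, so a width of 4 would be selected so far.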
+
+ DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
+ return Factor;
+}
+
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
+ unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the loop.
+ for (Instruction &I : *BB) {
+ Type *T = I.getType();
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&I))
+ continue;
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
+ continue;
+
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (auto *PN = dyn_cast<PHINode>(&I)) {
+ if (!Legal->isReductionVariable(PN))
+ continue;
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ T = RdxDesc.getRecurrenceType();
+ }
+
+ // Examine the stored values.
+ if (auto *ST = dyn_cast<StoreInst>(&I))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+ continue;
+
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return {MinWidth, MaxWidth};
+}
+
+unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
+ unsigned VF,
+ unsigned LoopCost) {
+
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example, frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small, then we interleave to reduce the loop
+ // overhead.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
+
+ // When we optimize for size, we don't interleave.
+ if (OptForSize)
+ return 1;
+
+  // A maximum safe dependence distance was already used to limit the
+  // vectorization factor, so do not interleave.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ return 1;
+
+ // Do not interleave loops with a relatively small trip count.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+ return 1;
+
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+ DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers\n");
+
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
+  // We divide by these values below, so assume that we have at least one
+  // instruction that uses at least one register.
+ R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+ R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+ // We calculate the interleave count using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the interleaved instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two interleave count to simplify any
+ // addressing operations or alignment considerations.
+ unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ R.MaxLocalUsers);
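+  // For example (hypothetical numbers): with 16 target registers, 2
+  // loop-invariant values, and a max local usage of 3, the interleave count
+  // is PowerOf2Floor((16 - 2) / 3) = 4.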
+
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur)
+ IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+ std::max(1U, (R.MaxLocalUsers - 1)));
+
+ // Clamp the interleave ranges to reasonable counts.
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+
+ // Check if the user has overridden the max.
+ if (VF == 1) {
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
+ } else {
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
+ }
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0)
+ LoopCost = expectedCost(VF).first;
+
+  // Clamp the calculated IC to be between 1 and the max interleave count
+ // that the target allows.
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
+ else if (IC < 1)
+ IC = 1;
+
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
+ if (VF > 1 && Legal->getReductionVars()->size()) {
+ DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
+ }
+
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerChecking()->Need);
+
+ // We want to interleave small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
+ DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and interleave until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
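+    // For example (hypothetical numbers): with a small-loop cost threshold of
+    // 20 and a LoopCost of 5, SmallIC = min(IC, PowerOf2Floor(20 / 5)) =
+    // min(IC, 4).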
+
+ // Interleave until store/load ports (estimated by max interleave count) are
+ // saturated.
+ unsigned NumStores = Legal->getNumStores();
+ unsigned NumLoads = Legal->getNumLoads();
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+
+ // If we have a scalar reduction (vector reductions are already dealt with
+ // by this point), we can increase the critical path length if the loop
+ // we're interleaving is inside another loop. Limit, by default to 2, so the
+ // critical path only gets increased by one reduction operation.
+ if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
+ }
+
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
+ }
+
+ DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
+ }
+
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
+ bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
+ }
+
+ DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ return 1;
+}
+
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+ // This function calculates the register usage by measuring the highest number
+ // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in topological order and
+ // assign a number to each instruction. We use RPO to ensure that defs are
+ // met before their users. We assume that each instruction that has in-loop
+ // users starts an interval. We record every time that an in-loop value is
+ // used, so we have a list of the first and last occurrences of each
+ // instruction. Next, we transpose this data structure into a multi map that
+ // holds the list of intervals that *end* at a specific location. This multi
+ // map allows us to perform a linear search. We scan the instructions linearly
+ // and record each time that a new interval starts, by placing it in a set.
+ // If we find this value in the multi-map then we remove it from the set.
+ // The max register usage is the maximum size of the set.
+ // We also search for instructions that are defined outside the loop, but are
+ // used inside the loop. We need this number separately from the max-interval
+ // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
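+  //
+  // For example (hypothetical ordering): if an instruction is assigned index
+  // 0 and its last in-loop use is at index 3, its interval stays open while
+  // indices 1 and 2 are scanned and is closed when index 3 is processed.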
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage RU;
+ RU.NumInstructions = 0;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ typedef DenseMap<Instruction *, unsigned> IntervalMap;
+  // Maps each index to its instruction in program order.
+ DenseMap<unsigned, Instruction *> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+  // Saves the set of instructions that are used in the loop.
+ SmallSet<Instruction *, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value *, 8> LoopInvariants;
+
+ unsigned Index = 0;
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ RU.NumInstructions += BB->size();
+ for (Instruction &I : *BB) {
+ IdxToInstr[Index++] = &I;
+
+ // Save the end location of each USE.
+ for (Value *U : I.operands()) {
+ auto *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr)
+ continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = Index;
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ typedef SmallVector<Instruction *, 2> InstrList;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
+
+ SmallSet<Instruction *, 8> OpenIntervals;
+
+ // Get the size of the widest register.
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ unsigned WidestRegister =
+ std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+
+ DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
+ auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+ if (Ty->isTokenTy())
+ return 0U;
+ unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+ return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ };
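+  // For example (hypothetical widths): with a 128-bit widest register, an i32
+  // value at VF=4 occupies max(1, 4 * 32 / 128) = 1 register, and at VF=8 it
+  // occupies 2.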
+
+ for (unsigned int i = 0; i < Index; ++i) {
+ Instruction *I = IdxToInstr[i];
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (Instruction *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I))
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ if (VFs[j] == 1) {
+ MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
+ continue;
+ }
+ collectUniformsAndScalars(VFs[j]);
+ // Count the number of live intervals.
+ unsigned RegUsage = 0;
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst) ||
+ isScalarAfterVectorization(Inst, VFs[j]))
+ continue;
+ RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+ }
+
+ DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ unsigned Invariant = 0;
+ if (VFs[i] == 1)
+ Invariant = LoopInvariants.size();
+ else {
+ for (auto Inst : LoopInvariants)
+ Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+ }
+
+ DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
+}
+
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
+ if (VF < 2 || InstsToScalarize.count(VF))
+ return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+ // Find all the instructions that are scalar with predication in the loop and
+ // determine if it would be better to not if-convert the blocks they are in.
+ // If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!Legal->blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (Legal->isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+ Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+ unsigned VF) {
+
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
+ int Discount = 0;
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ isScalarAfterVectorization(I, VF))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (Legal->isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (isUniformAfterVectorization(J, VF))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Returns true if an operand that cannot be scalarized must be extracted
+ // from a vector. We will account for this scalarization overhead below. Note
+ // that the non-void predicated instructions are placed in their own blocks,
+ // and their return values are inserted into vectors. Thus, an extract would
+ // still be required.
+ auto needsExtract = [&](Instruction *I) -> bool {
+ return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.count(I))
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
+ unsigned VectorCost = getInstructionCost(I, VF).first;
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
+ unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+ true, false);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
+ else if (needsExtract(J))
+ ScalarCost += TTI.getScalarizationOverhead(
+ ToVectorTy(J->getType(),VF), false, true);
+ }
+
+ // Scale the total scalar cost by block probability.
+ ScalarCost /= getReciprocalPredBlockProb();
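+    // For example (hypothetical probability): with a reciprocal block
+    // probability of 2 (the block is assumed to execute half the time), a
+    // scalar cost of 8 becomes 4.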
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
+ return Discount;
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ VectorizationCostTy Cost;
+
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ collectInstsToScalarize(VF);
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ VectorizationCostTy BlockCost;
+
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ // Skip dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&I))
+ continue;
+
+ VectorizationCostTy C = getInstructionCost(&I, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C.first = ForceTargetInstructionCost;
+
+ BlockCost.first += C.first;
+ BlockCost.second |= C.second;
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
+ << VF << " For instruction: " << I << '\n');
+ }
+
+ // If we are vectorizing a predicated block, it will have been
+ // if-converted. This means that the block's instructions (aside from
+ // stores and instructions that may divide by zero) will now be
+ // unconditionally executed. For the scalar case, we may not always execute
+ // the predicated block. Thus, scale the block's cost by the probability of
+ // executing it.
+ if (VF == 1 && Legal->blockNeedsPredication(BB))
+ BlockCost.first /= getReciprocalPredBlockProb();
+
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
+ }
+
+ return Cost;
+}
+
+/// \brief Gets Address Access SCEV after verifying that the access pattern
+/// is loop invariant except the induction variable dependence.
+///
+/// This SCEV can be sent to the Target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+ Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ ScalarEvolution *SE,
+ const Loop *TheLoop) {
+ auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return nullptr;
+
+ // We are looking for a gep with all loop invariant indices except for one
+ // which should be an induction variable.
+ unsigned NumOperands = Gep->getNumOperands();
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ Value *Opd = Gep->getOperand(i);
+ if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+ !Legal->isInductionVariable(Opd))
+ return nullptr;
+ }
+
+ // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
+ return SE->getSCEV(Ptr);
+}
+
+static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
+}
+
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ Value *Ptr = getPointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+ // if it's known at compile time.
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+ Cost += VF *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, I);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (Legal->isScalarWithPredication(I))
+ Cost /= getReciprocalPredBlockProb();
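+ // Informally, the estimate computed above is roughly
+ //   VF * (address computation + scalar memory op) + insert/extract overhead,
+ // scaled down by the block probability when the access is predicated.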
+
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ unsigned Cost = 0;
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ unsigned VF) {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *ValTy = LI->getType();
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = LI->getAlignment();
+ unsigned AS = LI->getPointerAddressSpace();
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+}
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ auto Group = Legal->getInterleavedAccessGroup(I);
+ assert(Group && "Failed to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
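+ // For example, a load group with factor 4 whose members occupy positions 0
+ // and 2 produces Indices = {0, 2}; the gap positions are simply not loaded.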
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+ Group->getFactor(), Indices,
+ Group->getAlignment(), AS);
+
+ if (Group->isReverse())
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ unsigned VF) {
+
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
+ if (VF == 1) {
+ Type *ValTy = getMemInstValueType(I);
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ }
+ return getWideningCost(I, VF);
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = 1;
+
+ if (VF > 1 && isProfitableToScalarize(I, VF))
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
+ // Forced scalars do not have any scalarization overhead.
+ if (VF > 1 && ForcedScalars.count(VF) &&
+ ForcedScalars.find(VF)->second.count(I))
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
+ Type *VectorTy;
+ unsigned C = getInstructionCost(I, VF, VectorTy);
+
+ bool TypeNotScalarized =
+ VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
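+ // Informally: the type counts as genuinely vectorized only if the target
+ // legalizes <VF x Ty> into fewer than VF parts; VF or more parts would
+ // effectively amount to scalarization.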
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+ if (VF == 1)
+ return;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+ // Scalar load + broadcast
+ unsigned Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Widen, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
+ unsigned InterleaveCost = UINT_MAX;
+ unsigned NumAccesses = 1;
+ if (Legal->isAccessInterleaved(&I)) {
+ auto Group = Legal->getInterleavedAccessGroup(&I);
+ assert(Group && "Failed to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
+ unsigned GatherScatterCost =
+ Legal->isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
+ : UINT_MAX;
+
+ unsigned ScalarizationCost =
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+ // Choose the better solution for the current VF, record the decision, and
+ // use it during vectorization.
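+ // Note how the comparisons below break ties: a tie between interleaving
+ // and gather/scatter favors interleaving, while a tie between
+ // gather/scatter and scalarization favors scalarization.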
+ unsigned Cost;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+ // If the instruction belongs to an interleave group, the whole group
+ // receives the same decision. The whole group receives the cost, but
+ // the cost will actually be assigned to one instruction.
+ if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+
+ // Make sure that any load of an address and any other address computation
+ // remain scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
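+ // The worklist below only follows operands that are defined in the same
+ // block as their user and skips PHI nodes, so just the local
+ // address-computation chain is collected.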
+ SmallVector<Instruction *, 4> Worklist;
+ for (auto *I : AddrDefs)
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second == true)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+ // Setting the desired widening decision should ideally be handled by
+ // the cost functions, but since this involves the task of finding out
+ // whether the loaded register is involved in an address computation, it
+ // is instead changed here when we know this is the case.
+ if (getWideningDecision(I, VF) == CM_Widen)
+ // Scalarize a widened load of address.
+ setWideningDecision(I, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(I, 1)));
+ else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
+ setWideningDecision(Member, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(Member, 1)));
+ }
+ }
+ } else
+ // Make sure I gets scalarized and receives a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
+}
+
+unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ unsigned VF,
+ Type *&VectorTy) {
+ Type *RetTy = I->getType();
+ if (canTruncateToMinimalBitwidth(I, VF))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+ VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+ // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
+ if (VF > 1 && BI->isConditional() &&
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
+ Type *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+ (TTI.getCFInstrCost(Instruction::Br) * VF));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
+ }
+ case Instruction::PHI: {
+ auto *Phi = cast<PHINode>(I);
+
+ // First-order recurrences are replaced by vector shuffles inside the loop.
+ if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
+ return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ VectorTy, VF - 1, VectorTy);
+
+ // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
+ // converted into select instructions. We require N - 1 selects per phi
+ // node, where N is the number of incoming values.
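+ // For example, a phi node merging three incoming values is costed as two
+ // vector selects after if-conversion.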
+ if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
+ return (Phi->getNumIncomingValues() - 1) *
+ TTI.getCmpSelInstrCost(
+ Instruction::Select, ToVectorTy(Phi->getType(), VF),
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+
+ return TTI.getCFInstrCost(Instruction::PHI);
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // If we have a predicated instruction, it may not be executed for each
+ // vector lane. Get the scalarization cost and scale this amount by the
+ // probability of executing the predicated block. If the instruction is not
+ // predicated, we fall through to the next case.
+ if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ unsigned Cost = 0;
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
+ Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+
+ // The cost of the non-predicated instruction.
+ Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ return Cost / getReciprocalPredBlockProb();
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Since we will replace the stride by 1, the multiplication should go away.
+ if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ return 0;
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this are shifts on x86.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+ Value *Op2 = I->getOperand(1);
+
+ // Check for a splat or for a non-uniform vector of constants.
+ if (isa<ConstantInt>(Op2)) {
+ ConstantInt *CInt = cast<ConstantInt>(Op2);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
+ if (SplatValue) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ }
+ } else if (Legal->isUniform(Op2)) {
+ Op2VK = TargetTransformInfo::OK_UniformValue;
+ }
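+ // As an illustrative example, a shift whose second operand is the splat
+ // constant <4 x i32> <i32 8, i32 8, i32 8, i32 8> is classified as a
+ // uniform power-of-two constant, which some targets (such as x86) handle
+ // more cheaply than a variable shift amount.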
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+ VectorTy = ToVectorTy(ValTy, VF);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
+ unsigned Width = VF;
+ if (Width > 1) {
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == CM_Scalarize)
+ Width = 1;
+ }
+ VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ return getMemoryInstructionCost(I, VF);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ Trunc->getSrcTy(), Trunc);
+ }
+
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy =
+ VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ if (canTruncateToMinimalBitwidth(I, VF)) {
+ // This cast is going to be shrunk. This may remove the cast or it might
+ // turn it into a slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
+ if (I->getOpcode() == Instruction::Trunc) {
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ } else if (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt) {
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ }
+ }
+
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+ }
+ case Instruction::Call: {
+ bool NeedToScalarize;
+ CallInst *CI = cast<CallInst>(I);
+ unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+ if (getVectorIntrinsicIDForCall(CI, TLI))
+ return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+ return CallCost;
+ }
+ default:
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ getScalarizationOverhead(I, VF, TTI);
+ } // end of switch.
+}
+
+char LoopVectorize::ID = 0;
+static const char lv_name[] = "Loop Vectorization";
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
+ return new LoopVectorize(NoUnrolling, AlwaysVectorize);
+}
+}
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+
+ // Check if the pointer operand of a load or store instruction is
+ // consecutive.
+ if (auto *Ptr = getPointerOperand(Inst))
+ return Legal->isConsecutivePtr(Ptr);
+ return false;
+}
+
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
+ SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+}
+
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+
+ // Width 1 means no vectorization; a cost of 0 means an uncomputed cost.
+ const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
+ 0U};
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
+ return NoVectorization;
+
+ if (UserVF) {
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ CM.selectUserVectorizationFactor(UserVF);
+ return {UserVF, 0};
+ }
+
+ unsigned MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF != 0 && "MaxVF is zero.");
+ if (MaxVF == 1)
+ return NoVectorization;
+
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Move instructions to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
+ for (auto &Entry : SinkAfter) {
+ Entry.first->removeFromParent();
+ Entry.first->insertAfter(Entry.second);
+ DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
+ << " to vectorize a 1st order recurrence.\n");
+ }
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
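+ // For instance, an update such as "%iv.next = add i64 %iv, 1" (illustrative
+ // names) becomes dead once the phi %iv is its only remaining live user,
+ // because the vectorized loop emits its own induction steps.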
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
+void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
+ auto *SI = dyn_cast<StoreInst>(Instr);
+ bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+ return scalarizeInstruction(Instr, IfPredicateInstr);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
+
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *Ty = Val->getType();
+ assert(!Ty->isVectorTy() && "Val must be a scalar");
+
+ if (Ty->isFloatingPointTy()) {
+ Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+ // Floating point operations had to be 'fast' to enable the unrolling.
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+ return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ }
+ Constant *C = ConstantInt::get(Ty, StartIdx);
+ return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+}
+
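+// The metadata added below takes roughly the following form (illustrative
+// sketch only):
+//   br i1 %cond, label %header, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, ..., !1}
+//   !1 = !{!"llvm.loop.unroll.runtime.disable"}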
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
+bool LoopVectorizePass::processLoop(Loop *L) {
+ assert(L->empty() && "Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
+
+ DEBUG(dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated as OptimizationRemarkAnalysis. Remarks
+ // generated as OptimizationRemark and OptimizationRemarkMissed are less
+ // verbose; they report vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
+ &Requirements, &Hints);
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitMissedWarning(F, L, Hints, ORE);
+ return false;
+ }
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
+ unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ bool HasExpectedTC = (ExpectedTC > 0);
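+ // ScalarEvolution reports 0 when it cannot compute a small constant maximum
+ // trip count, so a zero value above means the trip count is unknown.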
+
+ if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+ auto EstimatedTC = getLoopEstimatedTripCount(L);
+ if (EstimatedTC) {
+ ExpectedTC = *EstimatedTC;
+ HasExpectedTC = true;
+ }
+ }
+
+ if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ DEBUG(dbgs() << "\n");
+ // Loops with a very small trip count are considered for vectorization
+ // under OptForSize, thereby making sure the cost of their loop body is
+ // dominant, free of runtime guards and scalar iteration overheads.
+ OptForSize = true;
+ }
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.
+ // FIXME: This check doesn't seem possibly correct -- what if the loop is
+ // an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
+ "NoImplicitFloat", L)
+ << "loop not vectorized due to NoImplicitFloat attribute");
+ emitMissedWarning(F, L, Hints, ORE);
+ return false;
+ }
+
+ // Check if the target supports potentially unsafe FP vectorization.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ ORE->emit(
+ createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
+ << "loop not vectorized due to unsafe FP support.");
+ emitMissedWarning(F, L, Hints, ORE);
+ return false;
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor VF =
+ LVP.plan(OptForSize, UserVF);
+
+ // Select the interleave count.
+ unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+
+ // Get user interleave count.
+ unsigned UserIC = Hints.getInterleave();
+
+ // Identify the diagnostic messages that should be produced.
+ std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ emitMissedWarning(F, L, Hints, ORE);
+ return false;
+ }
+
+ if (VF.Width == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg = std::make_pair(
+ "VectorizationNotBeneficial",
+ "the cost-model indicates that vectorization is not beneficial");
+ VectorizeLoop = false;
+ }
+
+ if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingNotBeneficial",
+ "the cost-model indicates that interleaving is not beneficial");
+ InterleaveLoop = false;
+ if (UserIC == 1) {
+ IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
+ IntDiagMsg.second +=
+ " and is explicitly disabled or interleave count is set to 1";
+ }
+ } else if (IC > 1 && UserIC == 1) {
+ // Tell the user interleaving is beneficial but is explicitly disabled.
+ DEBUG(dbgs()
+ << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = std::make_pair(
+ "InterleavingBeneficialButDisabled",
+ "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1");
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+ // Do not vectorize or interleave the loop.
+ ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second);
+ ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second);
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second);
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second);
+ } else if (VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ using namespace ore;
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop, then
+ // interleave it.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
+ &CM);
+ LVP.executePlan(Unroller);
+
+ ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")");
+ } else {
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM);
+ LVP.executePlan(LB);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling a scalar loop when there are
+ // no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Report the vectorization decision.
+ ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
+ }
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+}
+
+bool LoopVectorizePass::runImpl(
+ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ OptimizationRemarkEmitter &ORE_) {
+
+ SE = &SE_;
+ LI = &LI_;
+ TTI = &TTI_;
+ DT = &DT_;
+ BFI = &BFI_;
+ TLI = TLI_;
+ AA = &AA_;
+ AC = &AC_;
+ GetLAA = &GetLAA_;
+ DB = &DB_;
+ ORE = &ORE_;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ return false;
+
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+ // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ addAcyclicInnerLoop(*L, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= processLoop(L);
+ }
+
+ // Each identified inner loop has now been processed.
+ return Changed;
+}
+
+PreservedAnalyses LoopVectorizePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+ bool Changed =
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 000000000000..4425043ad39a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,5157 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+using namespace slpvectorizer;
+
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
+static cl::opt<int>
+ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize if you gain more than this "
+ "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+static cl::opt<unsigned> RecursionMaxDepth(
+ "slp-recursion-max-depth", cl::init(12), cl::Hidden,
+ cl::desc("Limit the recursion depth when building a vectorizable tree"));
+
+static cl::opt<unsigned> MinTreeSize(
+ "slp-min-tree-size", cl::init(3), cl::Hidden,
+ cl::desc("Only vectorize small trees if they are fully vectorizable"));
+
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+ BasicBlock *BB = I0->getParent();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ return false;
+
+ if (BB != I->getParent())
+ return false;
+ }
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are constants.
+static bool allConstant(ArrayRef<Value *> VL) {
+ for (Value *i : VL)
+ if (!isa<Constant>(i))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
+/// \returns the opcode that can be combined with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+ switch (Op) {
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ default:
+ return 0;
+ }
+}
+
+/// \returns true if \p Value is odd, false otherwise.
+static bool isOdd(unsigned Value) {
+ return Value & 1;
+}
+
+/// \returns true if Opcode \p Op can be part of an alternate sequence which
+/// can later be merged as a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+ return Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add;
+}
+
+/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
+/// alternating fadd/fsub, fsub/fadd, add/sub, or sub/add sequence
+/// (e.g. opcodes fadd, fsub, fadd, fsub, ...), and 0 otherwise.
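+/// For example (illustrative), the sequence {fadd, fsub, fadd, fsub} can be
+/// emitted as two vector operations plus one shufflevector that blends their
+/// even and odd lanes.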
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Opcode = I0->getOpcode();
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
+ return 0;
+ }
+ return Instruction::ShuffleVector;
+}
+
+/// \returns The opcode if all of the Instructions in \p VL have the same
+/// opcode, or zero.
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return 0;
+ unsigned Opcode = I0->getOpcode();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || Opcode != I->getOpcode()) {
+ if (canCombineAsAltInst(Opcode) && i == 1)
+ return isAltInst(VL);
+ return 0;
+ }
+ }
+ return Opcode;
+}
+
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[i]->getType() != Ty)
+ return false;
+
+ return true;
+}
+
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
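+/// For example (illustrative), "extractelement <4 x i32> %v, i32 2" matches
+/// Idx == 2, and "extractvalue {i32, i32} %s, 1" matches Idx == 1.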
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ if (Opcode == Instruction::ExtractElement) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ return CI && CI->getZExtValue() == Idx;
+ } else {
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+ }
+}
+
+/// \returns True if an in-tree use also needs an extract. This refers to a
+/// possible scalar operand in a vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ return (CI->getArgOperand(1) == Scalar);
+ }
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
+static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
+
+namespace llvm {
+namespace slpvectorizer {
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
+ typedef SmallVector<Value *, 8> ValueList;
+ typedef SmallVector<Instruction *, 16> InstrList;
+ typedef SmallPtrSet<Value *, 16> ValueSet;
+ typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+ ExtraValueToDebugLocsMap;
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
+ : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+ }
+
+ /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+ /// Vectorize the tree but with the list of externally used values \p
+ /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+ /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
+
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
+ int getSpillCost();
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
+ int getTreeCost();
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
+ /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ NumLoadsWantToKeepOrder = 0;
+ NumLoadsWantToChangeOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
+ MinBWs.clear();
+ }
+
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
+ /// \brief Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns true if it is beneficial to reverse the vector order.
+ bool shouldReorder() const {
+ return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ }
+
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V);
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+ // \returns maximum vector register size as set by TTI or overridden by cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+ // \returns minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
+ /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable();
+
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
+private:
+ struct TreeEntry;
+
+ /// \returns the cost of the vectorizable entry.
+ int getEntryCost(TreeEntry *E);
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+
+ /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+ bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the pointer to the vectorized value if \p VL is already
+ /// vectorized, or NULL. They may happen in cycles.
+ Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
+ int getGatherCost(Type *Ty);
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
+ int getGatherCost(ArrayRef<Value *> VL);
+
+ /// \brief Set the Builder insert point to one after the last instruction in
+ /// the bundle
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
+ /// \returns a vector from a collection of scalars in \p VL.
+ Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree();
+
+ /// \brief Reorder commutative operands in an alternate shuffle if doing so
+ /// results in vectorized code.
+ void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ /// \brief Reorder commutative operands to improve the probability of
+ /// generating vectorized code.
+ void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ struct TreeEntry {
+ TreeEntry(std::vector<TreeEntry> &Container)
+ : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+ Container(Container) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ }
+
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue;
+
+ /// Do we need to gather this sequence?
+ bool NeedToGather;
+
+ /// Points back to the VectorizableTree.
+ ///
+ /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ std::vector<TreeEntry> &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
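+ /// For example (illustrative), when two bundles form a "diamond" and reuse
+ /// the same entry, both user indices are recorded here.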
+ SmallVector<int, 1> UserTreeIndices;
+ };
+
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ int &UserTreeIdx) {
+ VectorizableTree.emplace_back(VectorizableTree);
+ int idx = VectorizableTree.size() - 1;
+ TreeEntry *Last = &VectorizableTree[idx];
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ Last->NeedToGather = !Vectorized;
+ if (Vectorized) {
+ for (int i = 0, e = VL.size(); i != e; ++i) {
+ assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
+ ScalarToTreeEntry[VL[i]] = idx;
+ }
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
+
+ if (UserTreeIdx >= 0)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+ UserTreeIdx = idx;
+ return Last;
+ }
+
+ /// -- Vectorization State --
+ /// Holds all of the tree entries.
+ std::vector<TreeEntry> VectorizableTree;
+
+ TreeEntry *getTreeEntry(Value *V) {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
+ const TreeEntry *getTreeEntry(Value *V) const {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, int> ScalarToTreeEntry;
+
+ /// A list of scalars that we found that we need to keep as scalars.
+ ValueSet MustGather;
+
+ /// This POD struct describes one external user in the vectorized tree.
+ struct ExternalUser {
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+ // The scalar in our function that is used externally.
+ Value *Scalar;
+ // The user that uses the scalar.
+ llvm::User *User;
+ // The lane to which the scalar belongs.
+ int Lane;
+ };
+ typedef SmallVector<ExternalUser, 16> UserList;
+
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I) {
+ I->removeFromParent();
+ I->dropAllReferences();
+ DeletedInstructions.emplace_back(I);
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ SmallVector<unique_value, 8> DeletedInstructions;
+
+ /// A list of values that need to be extracted out of the tree.
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, which means that this Internal Scalar will be used later,
+ /// after vectorization.
+ UserList ExternalUses;
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> GatherSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
+
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
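+ /// For example (illustrative), a two-element bundle {%a = add ..., %b = add ...}
+ /// is represented by two ScheduleData objects chained through NextInBundle,
+ /// with FirstInBundle of both pointing at the ScheduleData of %a.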
+ struct ScheduleData {
+
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData()
+ : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
+ NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
+ Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
+ UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+
+ void init(int BlockSchedulingRegionID) {
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled dependent instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst;
+
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle;
+
+ /// Singly linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle;
+
+ /// Singly linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority;
+
+ /// The number of dependencies. It consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If it equals InvalidDeps, the number of dependencies has not been
+ /// calculated yet.
+ int Dependencies;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. It equals UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled;
+ };
+
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
+#endif
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
+
+ /// Contains all scheduling data for a basic block.
+ ///
+ struct BlockScheduling {
+
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
+ ScheduleStart(nullptr), ScheduleEnd(nullptr),
+ FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+ ScheduleRegionSize(0),
+ ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ SchedulingRegionID(1) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ // Handle the def-use chain dependencies.
+ for (Use &U : BundleMember->Inst->operands()) {
+ ScheduleData *OpDef = getScheduleData(U.get());
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+
+ /// Put all instructions into the ReadyList which are ready for scheduling.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ }
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+ /// cyclic dependencies. This is only a dry-run, no instructions are
+ /// actually moved at this stage.
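+ /// For example (illustrative), a bundle {%a, %b} cannot be scheduled if %b
+ /// depends on %a through an instruction that is outside the bundle.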
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+
+ /// Extends the scheduling region so that V is inside the region.
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+ /// Sets all instructions in the scheduling region to un-scheduled.
+ void resetSchedule();
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to an Instruction.
+ /// Note that the mapping survives across all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion;
+
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+ /// is incremented, which "removes" all ScheduleData from the region.
+ int SchedulingRegionID;
+ };
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
+
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
+
+ // Number of load bundles that contain consecutive loads.
+ int NumLoadsWantToKeepOrder;
+
+ // Number of load bundles that contain consecutive loads in reversed order.
+ int NumLoadsWantToChangeOrder;
+
+ // Analysis and block reference.
+ Function *F;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+ /// Instruction builder to construct the vectorized tree.
+ IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be signed-extended, rather than zero-extended, back to its
+ /// original width.
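+ /// For example (illustrative), an i32 value known to need only 8 bits and
+ /// requiring sign extension back to i32 maps to the pair (8, true).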
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+};
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ typedef TreeEntry *NodeRef;
+
+ /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<ChildIteratorType,
+ SmallVector<int, 1>::iterator> {
+
+ std::vector<TreeEntry> &VectorizableTree;
+
+ ChildIteratorType(SmallVector<int, 1>::iterator W,
+ std::vector<TreeEntry> &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return &VectorizableTree[*I]; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
+
+} // end namespace llvm
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, -1);
+
+ // Collect the values that we need to extract from the tree.
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
+
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, Lane);
+ continue;
+ }
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst)
+ continue;
+
+ // Skip in-tree scalars that become vectors
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!UseEntry->NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
+ // Ignore users in the user ignore list.
+ if (is_contained(UserIgnoreList, UserInst))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ }
+ }
+ }
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ int UserTreeIdx) {
+ bool isAltShuffle = false;
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+ if (Depth == RecursionMaxDepth) {
+ DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (VL[0]->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check that this shuffle vector refers to the alternate
+ // sequence of opcodes.
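+ // (Illustrative: a bundle like <add, sub, add, sub> is such an alternate
+ // sequence; it is handled by the ShuffleVector case below.)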
+ if (Opcode == Instruction::ShuffleVector) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Op = I0->getOpcode();
+ if (Op != Instruction::ShuffleVector)
+ isAltShuffle = true;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
+ DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Don't vectorize ephemeral values.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (EphValues.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is ephemeral.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check if this is a duplicate of another entry.
+ if (TreeEntry *E = getTreeEntry(VL[0])) {
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+ if (E->Scalars[i] != VL[i]) {
+ DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+ // Record the reuse of the tree node. FIXME, currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is already in tree.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ // If any of the scalars is marked as a value that needs to stay scalar then
+ // we need to gather the scalars.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+
+ // Check that every instruction appears once in this bundle.
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ for (unsigned j = i+1; j < e; ++j)
+ if (VL[i] == VL[j]) {
+ DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef) {
+ BSRef = llvm::make_unique<BlockScheduling>(BB);
+ }
+ BlockScheduling &BS = *BSRef.get();
+
+ if (!BS.tryScheduleBundle(VL, this)) {
+ DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ assert((!BS.getScheduleData(VL[0]) ||
+ !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
+ for (unsigned j = 0; j < VL.size(); ++j)
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ TerminatorInst *Term = dyn_cast<TerminatorInst>(
+ cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
+ if (Term) {
+ DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
+ PH->getIncomingBlock(i)));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ bool Reuse = canReuseExtract(VL, Opcode);
+ if (Reuse) {
+ DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+ } else {
+ BS.cancelScheduling(VL, VL0);
+ }
+ newTreeEntry(VL, Reuse, UserTreeIdx);
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load.
+ // For example, we don't want to vectorize loads that are smaller than 8
+ // bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM treats
+ // loading/storing it as an i8 struct. If we vectorize loads/stores from
+ // such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL[0]->getType();
+
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+ LoadInst *L = cast<LoadInst>(VL[i]);
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+ }
+
+ // Check if the loads are consecutive, reversed, or neither.
+ // TODO: What we really want is to sort the loads, but for now, check
+ // the two likely directions.
+ bool Consecutive = true;
+ bool ReverseConsecutive = true;
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ Consecutive = false;
+ break;
+ } else {
+ ReverseConsecutive = false;
+ }
+ }
+
+ if (Consecutive) {
+ ++NumLoadsWantToKeepOrder;
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ return;
+ }
+
+ // If none of the load pairs were consecutive when checked in order,
+ // check the reverse order.
+ if (ReverseConsecutive)
+ for (unsigned i = VL.size() - 1; i > 0; --i)
+ if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
+ ReverseConsecutive = false;
+ break;
+ }
+
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+
+ if (ReverseConsecutive) {
+ ++NumLoadsWantToChangeOrder;
+ DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
+ } else {
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ }
+ return;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ CmpInst *Cmp = cast<CmpInst>(VL[i]);
+ if (Cmp->getPredicate() != P0 ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::Select:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ return;
+ }
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return;
+ }
+
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+
+ ValueList Operands;
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(0));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic.
+ CallInst *CI = cast<CallInst>(VL[0]);
+ // Check if this is an Intrinsic call or something that can be
+ // represented by an intrinsic call
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (!isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+ Function *Int = CI->getCalledFunction();
+ Value *A1I = nullptr;
+ if (hasVectorInstrinsicScalarOpd(ID, 1))
+ A1I = CI->getArgOperand(1);
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
+ if (!CI2 || CI2->getCalledFunction() != Int ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
+ return;
+ }
+ // ctlz, cttz and powi are special intrinsics whose second argument
+ // should be the same in order for them to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ Value *A1J = CI2->getArgOperand(1);
+ if (A1I != A1J) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument "<< A1I<<"!=" << A1J
+ << "\n");
+ return;
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
+ << *VL[i] << '\n');
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true, UserTreeIdx);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(j);
+ Operands.push_back(CI2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcodes like add-sub
+ // then do not vectorize this instruction.
+ if (!isAltShuffle) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ newTreeEntry(VL, true, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderAltShuffleOperands(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
+ return;
+ }
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *j : VL)
+ Operands.push_back(cast<Instruction>(j)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+ }
+ return;
+ }
+ default:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+ DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N;
+ Type *EltTy;
+ auto *ST = dyn_cast<StructType>(T);
+ if (ST) {
+ N = ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else {
+ N = cast<ArrayType>(T)->getNumElements();
+ EltTy = cast<ArrayType>(T)->getElementType();
+ }
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ if (ST) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != EltTy)
+ return 0;
+ }
+ return N;
+}
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
+ assert(Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue);
+ assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ Instruction *E0 = cast<Instruction>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (Opcode == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
+ NElts = Vec->getType()->getVectorNumElements();
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ if (!matchExtractIndex(E0, 0, Opcode))
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ if (!matchExtractIndex(E, i, Opcode))
+ return false;
+ if (E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+ ArrayRef<Value*> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = VectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+
+ if (E->NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ return getGatherCost(E->Scalars);
+ }
+ unsigned Opcode = getSameOpcode(VL);
+ assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ switch (Opcode) {
+ case Instruction::PHI: {
+ return 0;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ if (canReuseExtract(VL, Opcode)) {
+ int DeadCost = 0;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ // If all users are going to be vectorized, the instruction can be
+ // considered dead.
+ // Likewise, if it has only one user, it will be vectorized for sure.
+ if (E->hasOneUse() ||
+ std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ }))
+ // Take credit for instruction that will become dead.
+ DeadCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
+ return -DeadCost;
+ }
+ return getGatherCost(VecTy);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+ VL0->getType(), SrcTy, VL0);
+
+ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
+ return VecCost - ScalarCost;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
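+ // Illustrative examples: {x << 3, y << 3} -> OK_UniformConstantValue,
+ // {x << 3, y << 2} -> OK_NonUniformConstantValue, and
+ // {x << 3, y << z} -> OK_AnyValue.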
+ ConstantInt *CInt = nullptr;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
+ }
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+ // FIXME: Currently the cost-model modification for division by a power of
+ // 2 is handled only for X86 and AArch64. Add support for other targets.
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+ CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
+ int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ Op1VP, Op2VP, Operands);
+ return VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+ return VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+ unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
+ int ScalarLdCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
+ VecTy, alignment, 0, VL0);
+ return VecLdCost - ScalarLdCost;
+ }
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
+ int ScalarStCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
+ VecTy, alignment, 0, VL0);
+ return VecStCost - ScalarStCost;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
+ ScalarTys.push_back(CI->getArgOperand(op)->getType());
+
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ int ScalarCallCost = VecTy->getNumElements() *
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
+
+ DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return VecCallCost - ScalarCallCost;
+ }
+ case Instruction::ShuffleVector: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ int ScalarCost = 0;
+ int VecCost = 0;
+ for (Value *i : VL) {
+ Instruction *I = cast<Instruction>(i);
+ if (!I)
+ break;
+ ScalarCost +=
+ TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ }
+ // VecCost is equal to the sum of the cost of creating 2 vectors
+ // and the cost of creating the shuffle.
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ VecCost =
+ TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+ Instruction *I1 = cast<Instruction>(VL[1]);
+ VecCost +=
+ TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+ VecCost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+ return VecCost - ScalarCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+ DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
+ VectorizableTree.size() << " is fully vectorizable .\n");
+
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
+ return true;
+
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Handle splat and all-constants stores.
+ if (!VectorizableTree[0].NeedToGather &&
+ (allConstant(VectorizableTree[1].Scalars) ||
+ isSplat(VectorizableTree[1].Scalars)))
+ return true;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+ return false;
+
+ return true;
+}
+
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
+
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+ assert(VectorizableTree.empty()
+ ? ExternalUses.empty()
+ : true && "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
+int BoUpSLP::getSpillCost() {
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
+ unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+ int Cost = 0;
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
+ for (const auto &N : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
+ if (!Inst)
+ continue;
+
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ DEBUG(
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ );
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
+ while (InstIt != PrevInstIt) {
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+ if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(VectorType::get(II->getType(), BundleWidth));
+ Cost += TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ ++PrevInstIt;
+ }
+
+ PrevInst = Inst;
+ }
+
+ return Cost;
+}
+
+int BoUpSLP::getTreeCost() {
+ int Cost = 0;
+ DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
+ VectorizableTree.size() << ".\n");
+
+ unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+
+ for (TreeEntry &TE : VectorizableTree) {
+ int C = getEntryCost(&TE);
+ DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
+ << *TE.Scalars[0] << ".\n");
+ Cost += C;
+ }
+
+ SmallSet<Value *, 16> ExtractCostCalculated;
+ int ExtractCost = 0;
+ for (ExternalUser &EU : ExternalUses) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(EU.User))
+ continue;
+
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
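+ // (Illustrative: if the tree was narrowed to i8 but the scalar's original
+ // type is i32, the cost below covers the extractelement plus the sext/zext
+ // from i8 back to i32.)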
+ auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ VecTy = VectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
+ }
+
+ int SpillCost = getSpillCost();
+ Cost += SpillCost + ExtractCost;
+
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(Type *Ty) {
+ int Cost = 0;
+ for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ return getGatherCost(VecTy);
+}
+
+// Reorder commutative operations in alternate shuffle if the resulting vectors
+// are consecutive loads. This would allow us to vectorize the tree.
+// If we have something like:
+// load a[0] - load b[0]
+// load b[1] + load a[1]
+// load a[2] - load b[2]
+// load a[3] + load b[3]
+// reordering the second line (load b[1] + load a[1]) would allow us to
+// vectorize this code.
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+ // Push left and right operands of binary operation into Left and Right
+ for (Value *i : VL) {
+ Left.push_back(cast<Instruction>(i)->getOperand(0));
+ Right.push_back(cast<Instruction>(i)->getOperand(1));
+ }
+
+ // Reorder if we have a commutative operation and consecutive accesses
+ // are on either side of the alternate instructions.
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (VL2->isCommutative() &&
+ isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ }
+}
+
+// Return true if I should be commuted before adding its left and right
+// operands to the arrays Left and Right.
+//
+// The vectorizer is trying either to have all elements on one side be
+// instructions with the same opcode, to enable further vectorization, or to
+// have a splat, to lower the vectorization cost.
+static bool shouldReorderOperands(int i, Instruction &I,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ bool AllSameOpcodeLeft,
+ bool AllSameOpcodeRight, bool SplatLeft,
+ bool SplatRight) {
+ Value *VLeft = I.getOperand(0);
+ Value *VRight = I.getOperand(1);
+ // If we have "SplatRight", try to see if commuting is needed to preserve it.
+ if (SplatRight) {
+ if (VRight == Right[i - 1])
+ // Preserve SplatRight
+ return false;
+ if (VLeft == Right[i - 1]) {
+ // Commuting would preserve SplatRight, but we don't want to break
+ // SplatLeft either, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (SplatLeft && VLeft == Left[i - 1])
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Right side.
+ if (SplatLeft) {
+ if (VLeft == Left[i - 1])
+ // Preserve SplatLeft
+ return false;
+ if (VRight == Left[i - 1])
+ return true;
+ }
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // If we have "AllSameOpcodeRight", check whether the left operand preserves
+ // it and the right one does not; in that case we want to commute.
+ if (AllSameOpcodeRight) {
+ unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+ if (IRight && RightPrevOpcode == IRight->getOpcode())
+ // Do not commute, a match on the right preserves AllSameOpcodeRight
+ return false;
+ if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+ // We have a match and may want to commute, but first check that there is
+ // not also a match on the existing Left operands that would preserve
+ // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (AllSameOpcodeLeft && ILeft &&
+ cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Left side.
+ if (AllSameOpcodeLeft) {
+ unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+ if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+ return false;
+ if (IRight && LeftPrevOpcode == IRight->getOpcode())
+ return true;
+ }
+ return false;
+}
+
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ if (VL.size()) {
+ // Peel the first iteration out of the loop since there's nothing
+ // interesting to do anyway and it simplifies the checks in the loop.
+ auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
+ auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
+ // Favor having instruction to the right. FIXME: why?
+ std::swap(VLeft, VRight);
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
+ }
+
+ // Keep track if we have instructions with all the same opcode on one side.
+ bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
+ bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+ // Keep track if we have one side with all the same value (broadcast).
+ bool SplatLeft = true;
+ bool SplatRight = true;
+
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ assert(I->isCommutative() && "Can only process commutative instruction");
+ // Commute to favor either a splat or maximizing having the same opcodes on
+ // one side.
+ if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight)) {
+ Left.push_back(I->getOperand(1));
+ Right.push_back(I->getOperand(0));
+ } else {
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
+ }
+ // Update Splat* and AllSameOpcode* after the insertion.
+ SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+ SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+ AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+ (cast<Instruction>(Left[i - 1])->getOpcode() ==
+ cast<Instruction>(Left[i])->getOpcode());
+ AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+ (cast<Instruction>(Right[i - 1])->getOpcode() ==
+ cast<Instruction>(Right[i])->getOpcode());
+ }
+
+ // If one operand ends up being a broadcast, return this operand order.
+ if (SplatRight || SplatLeft)
+ return;
+
+ // Finally check if we can get a longer vectorizable chain by reordering
+ // without breaking the good operand order detected above.
+ // E.g., if we have something like:
+ // load a[0] load b[0]
+ // load b[1] load a[1]
+ // load a[2] load b[2]
+ // load a[3] load b[3]
+ // Reordering the second pair (load b[1], load a[1]) would allow us to
+ // vectorize this code and we still retain the AllSameOpcode property.
+ // FIXME: This load reordering might break AllSameOpcode in some rare cases
+ // such as:
+ // add a[0],c[0] load b[0]
+ // add a[1],c[2] load b[1]
+ // b[2] load b[2]
+ // add a[3],c[3] load b[3]
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ if (isConsecutiveAccess(L, L1, *DL, *SE)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ // else unchanged
+ }
+}
+
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
+
+ // Get the basic block this bundle is in. All instructions in the bundle
+ // should be in this block.
+ auto *Front = cast<Instruction>(VL.front());
+ auto *BB = Front->getParent();
+ assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
+ return cast<Instruction>(V)->getParent() == BB;
+ }));
+
+ // The last instruction in the bundle in program order.
+ Instruction *LastInst = nullptr;
+
+ // Find the last instruction. The common case should be that BB has been
+ // scheduled, and the last instruction is VL.back(). So we start with
+ // VL.back() and iterate over schedule data until we reach the end of the
+ // bundle. The end of the bundle is marked by null ScheduleData.
+ if (BlocksSchedules.count(BB)) {
+ auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
+ if (Bundle && Bundle->isPartOfBundle())
+ for (; Bundle; Bundle = Bundle->NextInBundle)
+ LastInst = Bundle->Inst;
+ }
+
+ // LastInst can still be null at this point if there's either not an entry
+ // for BB in BlocksSchedules or there's no ScheduleData available for
+ // VL.back(). This can be the case if buildTree_rec aborts for various
+ // reasons (e.g., the maximum recursion depth is reached, the maximum region
+ // size is reached, etc.). ScheduleData is initialized in the scheduling
+ // "dry-run".
+ //
+ // If this happens, we can still find the last instruction by brute force. We
+ // iterate forwards from Front (inclusive) until we either see all
+ // instructions in the bundle or reach the end of the block. If Front is the
+ // last instruction in program order, LastInst will be set to Front, and we
+ // will visit all the remaining instructions in the block.
+ //
+ // One of the reasons we exit early from buildTree_rec is to place an upper
+ // bound on compile-time. Thus, taking an additional compile-time hit here is
+ // not ideal. However, this should be exceedingly rare since it requires that
+ // we both exit early from buildTree_rec and that the bundle be out-of-order
+ // (causing us to iterate all the way to the end of the block).
+ if (!LastInst) {
+ SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
+ for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
+ if (Bundle.erase(&I))
+ LastInst = &I;
+ if (Bundle.empty())
+ break;
+ }
+ }
+
+ // Set the insertion point after the last instruction in the bundle. Set the
+ // debug location to Front.
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+}
+
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+ Value *Vec = UndefValue::get(Ty);
+ // Generate the 'InsertElement' instruction.
+ for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+ Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+ if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
+ GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
+
+ // Add to our 'need-to-extract' list.
+ if (TreeEntry *E = getTreeEntry(VL[i])) {
+ // Find which lane we need to extract.
+ int FoundLane = -1;
+ for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ // Is this the lane of the scalar that we are looking for?
+ if (E->Scalars[Lane] == VL[i]) {
+ FoundLane = Lane;
+ break;
+ }
+ }
+ assert(FoundLane >= 0 && "Could not find the correct lane");
+ ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+ }
+ }
+ }
+
+ return Vec;
+}
+
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
+ if (const TreeEntry *En = getTreeEntry(VL[0])) {
+ if (En->isSame(VL) && En->VectorizedValue)
+ return En->VectorizedValue;
+ }
+ return nullptr;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ if (TreeEntry *E = getTreeEntry(VL[0]))
+ if (E->isSame(VL))
+ return vectorizeTree(E);
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ return Gather(VL, VecTy);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+
+ if (E->VectorizedValue) {
+ DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Type *ScalarTy = VL0->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+
+ if (E->NeedToGather) {
+ setInsertPointAfterBundle(E->Scalars);
+ auto *V = Gather(E->Scalars, VecTy);
+ E->VectorizedValue = V;
+ return V;
+ }
+
+ unsigned Opcode = getSameOpcode(E->Scalars);
+
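+ // All scalars in this tree entry share the same opcode; dispatch on it to
+ // emit the corresponding vector instruction.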
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ E->VectorizedValue = NewPhi;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallSet<BasicBlock*, 4> VisitedBBs;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ if (!VisitedBBs.insert(IBB).second) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
+ // Prepare the operand vector.
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ Value *Vec = vectorizeTree(Operands);
+ NewPhi->addIncoming(Vec, IBB);
+ }
+
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return NewPhi;
+ }
+
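+ // For extractelements, reuse the source vector directly when possible;
+ // otherwise fall back to a gather of the scalars.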
+ case Instruction::ExtractElement: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
+ Value *V = VL0->getOperand(0);
+ E->VectorizedValue = V;
+ return V;
+ }
+ setInsertPointAfterBundle(E->Scalars);
+ auto *V = Gather(E->Scalars, VecTy);
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ExtractValue: {
+ if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+ LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+ E->VectorizedValue = V;
+ return propagateMetadata(V, E->Scalars);
+ }
+ setInsertPointAfterBundle(E->Scalars);
+ auto *V = Gather(E->Scalars, VecTy);
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *InVec = vectorizeTree(INVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ CastInst *CI = dyn_cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ ValueList LHSV, RHSV;
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *L = vectorizeTree(LHSV);
+ Value *R = vectorizeTree(RHSV);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V;
+ if (Opcode == Instruction::FCmp)
+ V = Builder.CreateFCmp(P0, L, R);
+ else
+ V = Builder.CreateICmp(P0, L, R);
+
+ E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ ValueList TrueVec, FalseVec, CondVec;
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *Cond = vectorizeTree(CondVec);
+ Value *True = vectorizeTree(TrueVec);
+ Value *False = vectorizeTree(FalseVec);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ Value *V = Builder.CreateSelect(Cond, True, False);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ ValueList LHSVL, RHSVL;
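+ // For commutative binary operations, reorder the scalar operands to favor
+ // splats or matching opcodes on each side before vectorizing them.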
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+ reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ else
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
+ ++NumVectorInstructions;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ setInsertPointAfterBundle(E->Scalars);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
+ Type *ScalarLoadTy = LI->getType();
+ unsigned AS = LI->getPointerAddressSpace();
+
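+ // Bitcast the scalar pointer to a pointer to the vector type so a single
+ // wide load replaces the scalar loads.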
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+
+ // The pointer operand uses an in-tree scalar, so we add the new BitCast to
+ // the ExternalUses list to make sure that an extract will be generated in
+ // the future.
+ Value *PO = LI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+
+ unsigned Alignment = LI->getAlignment();
+ LI = Builder.CreateLoad(VecPtr);
+ if (!Alignment) {
+ Alignment = DL->getABITypeAlignment(ScalarLoadTy);
+ }
+ LI->setAlignment(Alignment);
+ E->VectorizedValue = LI;
+ ++NumVectorInstructions;
+ return propagateMetadata(LI, E->Scalars);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+ unsigned AS = SI->getPointerAddressSpace();
+
+ ValueList ValueOp;
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *VecValue = vectorizeTree(ValueOp);
+ Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+ StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+ // The pointer operand uses an in-tree scalar, so we add the new BitCast to
+ // the ExternalUses list to make sure that an extract will be generated in
+ // the future.
+ Value *PO = SI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+
+ if (!Alignment) {
+ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+ }
+ S->setAlignment(Alignment);
+ E->VectorizedValue = S;
+ ++NumVectorInstructions;
+ return propagateMetadata(S, E->Scalars);
+ }
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E->Scalars);
+
+ ValueList Op0VL;
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
+
+ Value *Op0 = vectorizeTree(Op0VL);
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList OpVL;
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
+
+ Value *OpVec = vectorizeTree(OpVL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E->Scalars);
+ Function *FI;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Value *ScalarArg = nullptr;
+ if (CI && (FI = CI->getCalledFunction())) {
+ IID = FI->getIntrinsicID();
+ }
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ // ctlz, cttz, and powi are special intrinsics whose second argument is
+ // a scalar. This argument should not be vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+ CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ ScalarArg = CEI->getArgOperand(j);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
+ OpVL.push_back(CEI->getArgOperand(j));
+ }
+
+ Value *OpVec = vectorizeTree(OpVL);
+ DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
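+ // Declare the vector form of the intrinsic and emit the call with the
+ // vectorized operands, preserving any operand bundles.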
+ Module *M = F->getParent();
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+ Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+ // The scalar argument uses an in-tree scalar, so we add the new vectorized
+ // call to the ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && getTreeEntry(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
+ E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ ValueList LHSVL, RHSVL;
+ assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ // Create a vector of LHS op1 RHS
+ BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+ Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+ // Create a vector of LHS op2 RHS
+ Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+ BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+ Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up odd and even scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OddScalars, EvenScalars;
+ unsigned e = E->Scalars.size();
+ SmallVector<Constant *, 8> Mask(e);
+ for (unsigned i = 0; i < e; ++i) {
+ if (isOdd(i)) {
+ Mask[i] = Builder.getInt32(e + i);
+ OddScalars.push_back(E->Scalars[i]);
+ } else {
+ Mask[i] = Builder.getInt32(i);
+ EvenScalars.push_back(E->Scalars[i]);
+ }
+ }
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+ propagateIRFlags(V0, EvenScalars);
+ propagateIRFlags(V1, OddScalars);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return nullptr;
+}
+
+Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0].Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0].VectorizedValue = Trunc;
+ }
+
+ DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
+
+ // Extract all of the elements with the external uses.
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
+
+ // Skip users that we have already RAUW'd. This happens when one instruction
+ // has multiple uses of the same value.
+ if (User && !is_contained(Scalar->users(), User))
+ continue;
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
+ assert(!E->NeedToGather && "Extracting from a gather list");
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
+ // If User == nullptr, the Scalar is used as extra arg. Generate
+ // ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ continue;
+ }
+
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ TerminatorInst *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (TreeEntry &EIdx : VectorizableTree) {
+ TreeEntry *Entry = &EIdx;
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+#ifndef NDEBUG
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ assert((getTreeEntry(U) ||
+ // It is legal to replace users in the ignorelist by undef.
+ is_contained(UserIgnoreList, U)) &&
+ "Replacing out-of-tree value with undef");
+ }
+#endif
+ Value *Undef = UndefValue::get(Ty);
+ Scalar->replaceAllUsesWith(Undef);
+ }
+ DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ eraseInstruction(cast<Instruction>(Scalar));
+ }
+ }
+
+ Builder.ClearInsertionPoint();
+
+ return VectorizableTree[0].VectorizedValue;
+}
+
+void BoUpSLP::optimizeGatherSequence() {
+ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (Instruction *it : GatherSeq) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
+
+ if (!Insert)
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(Insert->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+ // If the vector or the element that we insert into it are
+ // instructions that are defined in this basic block then we can't
+ // hoist this instruction.
+ Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+ Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+ if (CurrVec && L->contains(CurrVec))
+ continue;
+ if (NewElem && L->contains(NewElem))
+ continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ Insert->moveBefore(PreHeader->getTerminator());
+ }
+
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Perform O(N^2) search over the gather sequences and merge identical
+ // instructions. TODO: We can further optimize this scan if we split the
+ // instructions into different buckets based on the insert lane.
+ SmallVector<Instruction *, 16> Visited;
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = &*it++;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ for (Instruction *v : Visited) {
+ if (In->isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(v);
+ eraseInstruction(In);
+ In = nullptr;
+ break;
+ }
+ }
+ if (In) {
+ assert(!is_contained(Visited, In));
+ Visited.push_back(In);
+ }
+ }
+ }
+ CSEBlocks.clear();
+ GatherSeq.clear();
+}
+
+// Groups the instructions into a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle gets ready.
+bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
+ BoUpSLP *SLP) {
+ if (isa<PHINode>(VL[0]))
+ return true;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V))
+ return false;
+ }
+
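+ // Link the bundle members together and accumulate their unscheduled
+ // dependency counts into the bundle head.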
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+ // A bundle member was scheduled as single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions to a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to recalculate
+ // all dependencies. This seldom needs to be done a second time after adding
+ // the initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ SD->clearDependencies();
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+
+ DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+ // Now try to schedule the new bundle. Once the bundle becomes "ready", there
+ // are no cyclic dependencies and we can schedule it. Note that it's important
+ // that we don't "schedule" the bundle yet (see cancelScheduling).
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
+ ScheduleData *pickedSD = ReadyInsts.back();
+ ReadyInsts.pop_back();
+
+ if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+ schedule(pickedSD, ReadyInsts);
+ }
+ }
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL, VL[0]);
+ return false;
+ }
+ return true;
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
+ Value *OpValue) {
+ if (isa<PHINode>(OpValue))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(OpValue);
+ DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+ if (getScheduleData(V))
+ return true;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return true;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
+ BasicBlock::iterator LowerEnd = BB->end();
+ for (;;) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ return true;
+ }
+ UpIter++;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ return true;
+ }
+ DownIter++;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+ return true;
+}
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(
+ llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID);
+
+ if (I->mayReadOrWriteMemory()) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
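+ // Worklist of scheduling entities whose dependencies still need to be
+ // computed.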
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
+ ScheduleData *SD = WorkList.back();
+ WorkList.pop_back();
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+
+ DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ // I'm not sure if this can ever happen, but we need to be safe.
+ // This keeps the instruction/bundle from ever being scheduled,
+ // eventually disabling vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+
+ // Example explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ }
+ }
+}
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ assert(isInSchedulingRegion(SD));
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+
+ if (!BS->ScheduleStart)
+ return;
+
+ DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ ScheduleData *SD = BS->getScheduleData(I);
+ assert(
+ SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer have different opinion on what is a bundle");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
+
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+ // If V is a store, just return the width of the stored value without
+ // traversing the expression tree. This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+ Visited.insert(I);
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (!Visited.count(J))
+ Worklist.push_back(J);
+ }
+
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
+
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V.
+ if (!MaxWidth || FoundUnknownInst)
+ return DL->getTypeSizeInBits(V->getType());
+
+ // Otherwise, return the maximum width we found.
+ return MaxWidth;
+}
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
+ }
+
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
+
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ return false;
+ break;
+
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
+
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
+
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
+
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0].Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
+
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
+
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &Entry : VectorizableTree)
+ Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
+
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+ MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+ }
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+}
+
+namespace {
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
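+ // Legacy pass-manager wrapper: all the work is forwarded to the new-PM
+ // implementation stored in Impl.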
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // end anonymous namespace
+
+PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
+ TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
+
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
+
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+
+ // Use the bottom up slp vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
+
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ DEBUG(verifyFunction(F));
+ }
+ return Changed;
+}
+
+/// \brief Check that the Values in the slice of the VL array still exist in
+/// the WeakTrackingVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakTrackingVH
+/// array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
+ ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
+ unsigned SliceSize) {
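+ // Compare the current slice of values against the recorded weak handles; any
+ // mismatch means a value was deleted or replaced by earlier vectorization.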
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
+}
+
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned VecRegSize) {
+ unsigned ChainLen = Chain.size();
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ unsigned Sz = R.getVectorElementSize(Chain[0]);
+ unsigned VF = VecRegSize / Sz;
+
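+ // Bail out if the element size is not a power of two or if fewer than two
+ // chain elements fit in a vector register of the requested size.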
+ if (!isPowerOf2_32(Sz) || VF < 2)
+ return false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = ChainLen; i < e; ++i) {
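+ // Stop once fewer than VF stores remain from the current offset.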
+ if (i + VF > e)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+ R.buildTree(Operands);
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
+
+ int Cost = R.getTreeCost();
+
+ DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
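+ // A negative cost means the vectorized form is cheaper than the scalar form;
+ // vectorize only when the estimated saving exceeds SLPCostThreshold.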
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ BoUpSLP &R) {
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance to find an SLP vectorization
+ // opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
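+ // Pair Stores[i] with the first candidate that is consecutive in memory and
+ // record the link in ConsecutiveChain.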
+ for (auto &k : IndexQueue) {
+ if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
+ Tails.insert(Stores[k]);
+ Heads.insert(Stores[i]);
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
+ }
+ }
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ StoreInst *I = *it;
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I))
+ break;
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+ Size /= 2) {
+ if (vectorizeStoreChain(Operands, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
+
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
+
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
+
+ // Ignore store instructions that are volatile or have a pointer operand
+ // that doesn't point to a scalar type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
+ Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ }
+
+ // Ignore getelementptr instructions that have more than one index, a
+ // constant index, or a pointer operand that doesn't point to a scalar
+ // type.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
+ }
+ }
+}
+
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
+ Value *VL[] = { A, B };
+ return tryToVectorizeList(VL, R, None, true);
+}
+
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector,
+ bool AllowReorder) {
+ if (VL.size() < 2)
+ return false;
+
+ DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
+ << ".\n");
+
+ // Check that all of the parts are scalar instructions of the same type.
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+
+ unsigned Opcode0 = I0->getOpcode();
+
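+ // Compute the minimum and maximum vectorization factors to try from the
+ // element size, the minimum vector register size, and the list length.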
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2)
+ return false;
+
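+ // All values must be instructions with the same opcode and a vectorizable
+ // type.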
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty))
+ return false;
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || Inst->getOpcode() != Opcode0)
+ return false;
+ }
+
+ bool Changed = false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
+
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+ VF /= 2) {
+ // No actual vectorization should happen if the number of parts is the same
+ // as the provided vectorization factor (i.e. the scalar type is used for
+ // the vector code during codegen).
+ auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
+
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+
+ ArrayRef<Value *> BuildVectorSlice;
+ if (!BuildVector.empty())
+ BuildVectorSlice = BuildVector.slice(I, OpsWidth);
+
+ R.buildTree(Ops, BuildVectorSlice);
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && R.shouldReorder()) {
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here when
+ // there are exactly two operations.
+ assert(Ops.size() == 2);
+ assert(BuildVectorSlice.empty());
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
+ Value *VectorizedRoot = R.vectorizeTree();
+
+ // Reconstruct the build vector by extracting the vectorized root. This
+ // way we handle the case where some elements of the vector are
+ // undefined.
+ // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+ if (!BuildVectorSlice.empty()) {
+ // The insert point is the last build vector instruction. The
+ // vectorized root will precede it. This guarantees that we get an
+ // instruction. The vectorized tree could have been constant folded.
+ Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+ unsigned VecIdx = 0;
+ for (auto &V : BuildVectorSlice) {
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *I = cast<Instruction>(V);
+ assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
+ Instruction *Extract =
+ cast<Instruction>(Builder.CreateExtractElement(
+ VectorizedRoot, Builder.getInt32(VecIdx++)));
+ I->setOperand(1, Extract);
+ I->removeFromParent();
+ I->insertAfter(Extract);
+ InsertAfter = I;
+ }
+ }
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+ if (!V)
+ return false;
+
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
+ // Try to vectorize V.
+ if (tryToVectorizePair(Op0, Op1, R))
+ return true;
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ return true;
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ return true;
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ return true;
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ return true;
+ }
+ return false;
+}
+
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+/// vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+/// reduction. A pairwise reduction will generate a mask of
+/// <0,2,...> or <1,3,...> while a splitting reduction will generate
+/// <2,3,undef,undef> for a vector of 4 and NumEltsToRdx = 2.
+/// \param IsLeft If true, generate a mask of even elements; otherwise, of odd
+/// elements.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft,
+ IRBuilder<> &Builder) {
+ assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+ SmallVector<Constant *, 32> ShuffleMask(
+ VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+ if (IsPairwise)
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ else
+ // Move the upper half of the vector to the lower half.
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+ return ConstantVector::get(ShuffleMask);
+}
+
+namespace {
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaves.
+/// For example, this tree:
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ SmallVector<Value *, 16> ReductionOps;
+ SmallVector<Value *, 32> ReducedVals;
+ // Use map vector to make stable output.
+ MapVector<Instruction *, Value *> ExtraArgs;
+
+ BinaryOperator *ReductionRoot = nullptr;
+
+ /// The opcode of the reduction.
+ Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
+ /// The opcode of the values we perform a reduction on.
+ unsigned ReducedValueOpcode = 0;
+ /// Should we model this reduction as a pairwise reduction tree or a tree that
+ /// splits the vector in halves and adds those halves.
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as an extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
+
+public:
+ HorizontalReduction() = default;
+
+ /// \brief Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
+ "This phi needs to use the binary operator");
+
+ // We could have an initial reduction that is not an add.
+ // r *= v1 + v2 + v3 + v4
+ // In such a case start looking for a tree rooted in the first '+'.
+ if (Phi) {
+ if (B->getOperand(0) == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<BinaryOperator>(B->getOperand(1));
+ } else if (B->getOperand(1) == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ }
+ }
+
+ if (!B)
+ return false;
+
+ Type *Ty = B->getType();
+ if (!isValidElementType(Ty))
+ return false;
+
+ ReductionOpcode = B->getOpcode();
+ ReducedValueOpcode = 0;
+ ReductionRoot = B;
+
+ // We currently only support adds.
+ if ((ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd) ||
+ !B->isAssociative())
+ return false;
+
+ // Post order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators or selects.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(B, 0));
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
+ unsigned EdgeToVist = Stack.back().second++;
+ bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+ // Postorder visit.
+ if (EdgeToVist == 2 || IsReducedValue) {
+ if (IsReducedValue)
+ ReducedVals.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionOps.push_back(TreeN);
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
+ Value *NextV = TreeN->getOperand(EdgeToVist);
+ if (NextV != Phi) {
+ auto *I = dyn_cast<Instruction>(NextV);
+ // Continue analysis if the next operand is a reduction operation or
+ // (possibly) a reduced value. If the reduced value opcode is not set,
+ // the first operation encountered that is not the reduction operation is
+ // considered the reduced value class.
+ if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
+ I->getOpcode() == ReductionOpcode)) {
+ // Only handle trees in the current basic block.
+ if (I->getParent() != B->getParent()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!I->hasOneUse() && I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (I->getOpcode() == ReductionOpcode) {
+ // We need to be able to reassociate the reduction operations.
+ if (!I->isAssociative()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueOpcode &&
+ ReducedValueOpcode != I->getOpcode()) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueOpcode)
+ ReducedValueOpcode = I->getOpcode();
+
+ Stack.push_back(std::make_pair(I, 0));
+ continue;
+ }
+ }
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
+ }
+ return true;
+ }
+
+ /// \brief Attempt to vectorize the tree found by
+ /// matchAssociativeReduction.
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ if (ReducedVals.empty())
+ return false;
+
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. We can safely generate oversized
+ // vectors and rely on the backend to split them into legal sizes.
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < 4)
+ return false;
+
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+
+ Value *VectorizedTree = nullptr;
+ IRBuilder<> Builder(ReductionRoot);
+ FastMathFlags Unsafe;
+ Unsafe.setUnsafeAlgebra();
+ Builder.setFastMathFlags(Unsafe);
+ unsigned i = 0;
+
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ // The same extra argument may be used several times, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs)
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
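+ // Vectorize ReduxWidth reduced values at a time, shrinking the width as
+ // fewer values remain; stop once fewer than four values can be packed.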
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ V.buildTree(VL, ExternallyUsedValues, ReductionOps);
+ if (V.shouldReorder()) {
+ SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
+ V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ break;
+
+ V.computeMinimumValueSizes();
+
+ // Estimate cost.
+ int Cost =
+ V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ if (Cost >= -SLPCostThreshold)
+ break;
+
+ DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+ << ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+
+ // Emit a reduction.
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
+ if (VectorizedTree) {
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ } else
+ VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ assert(!Pair.second.empty() &&
+ "At least one DebugLoc must be inserted");
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ Pair.first, "bin.extra");
+ propagateIRFlags(VectorizedTree, I);
+ }
+ }
+ // Update users.
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ }
+ return VectorizedTree != nullptr;
+ }
+
+ unsigned numReductionValues() const {
+ return ReducedVals.size();
+ }
+
+private:
+ /// \brief Calculate the cost of a reduction.
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
+ Type *ScalarTy = FirstReducedVal->getType();
+ Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
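+ // Compare the target's cost for a pairwise reduction against a splitting
+ // (successive halving) reduction and choose the cheaper strategy.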
+ int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+ int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+ IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+ int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+ int ScalarReduxCost =
+ (ReduxWidth - 1) *
+ TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
+
+ DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
+
+ return VecReduxCost - ScalarReduxCost;
+ }
+
+ /// \brief Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
+ Value *TmpVec = VectorizedValue;
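+ // Emit a pairwise reduction: log2(ReduxWidth) rounds, each shuffling out the
+ // even and odd lanes and combining them with the reduction opcode.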
+ for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+ Value *LeftMask =
+ createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+ Value *RightMask =
+ createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+ Value *LeftShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+ Value *RightShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+ "rdx.shuf.r");
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
+ propagateIRFlags(TmpVec, RedOps);
+ }
+
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ }
+};
+} // end anonymous namespace
+
+/// \brief Recognize construction of vectors like
+/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches.
+///
+static bool findBuildVector(InsertElementInst *FirstInsertElem,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
+ return false;
+
+ InsertElementInst *IE = FirstInsertElem;
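+ // Walk forward through the chain of insertelement instructions, collecting
+ // each instruction and the scalar value it inserts.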
+ while (true) {
+ BuildVector.push_back(IE);
+ BuildVectorOpds.push_back(IE->getOperand(1));
+
+ if (IE->use_empty())
+ return false;
+
+ InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
+ if (!NextUse)
+ return true;
+
+ // If this isn't the final use, make sure the next insertelement is the only
+ // use. It's OK if the final constructed vector is used multiple times.
+ if (!IE->hasOneUse())
+ return false;
+
+ IE = NextUse;
+ }
+
+ return false;
+}
+
+/// \brief Like findBuildVector, but looks backwards for the construction of an
+/// aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ Value *V;
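+ // Walk backwards from the last insertvalue through the aggregate operands
+ // until the initial undef aggregate is reached.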
+ do {
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
+ return false;
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ return true;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+/// \brief Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return (
+ dyn_cast<Instruction>(R) &&
+ DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+ // that. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If horizontal reduction is not found
+/// and root instruction is a binary operation, vectorization of the operands is
+/// attempted.
+/// \returns true if a horizontal reduction was matched and reduced or operands
+/// of one of the binary instruction were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible)
+/// or no vectorization of any binary operation feeding \a Root instruction was
+/// performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ if (!Root)
+ return false;
+
+ if (Root->getParent() != BB)
+ return false;
+ // Start analysis starting from Root instruction. If horizontal reduction is
+ // found, try to vectorize it. If it is not a horizontal reduction or
+ // vectorization is not possible or not effective, and currently analyzed
+ // instruction is a binary operation, try to vectorize the operands, using
+ // pre-order DFS traversal order. If the operands were not vectorized, repeat
+ // the same procedure considering each operand as a possible root of the
+ // horizontal reduction.
+ // Interrupt the process if the Root instruction itself was vectorized or all
+ // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
+ SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V;
+ unsigned Level;
+ std::tie(V, Level) = Stack.pop_back_val();
+ if (!V)
+ continue;
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst))
+ continue;
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+
+ // Try to vectorize operands.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ Stack.emplace_back(Op, Level);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return tryToVectorizeHorReductionOrInstOperands(
+ P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
+}
+
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ SmallVector<Value *, 4> Incoming;
+ SmallSet<Value *, 16> VisitedInstrs;
+
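+ // Repeatedly collect and vectorize PHI nodes of the same type until no more
+ // can be vectorized, since a successful attempt may change other instructions.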
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (Instruction &I : *BB) {
+ PHINode *P = dyn_cast<PHINode>(&I);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+ // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ // The order in which the phi nodes appear in the program does not matter.
+ // So allow tryToVectorizeList to reorder them if it is beneficial. This
+ // is done when there are exactly two elements since tryToVectorizeList
+ // asserts that there are only two values when AllowReorder is true.
+ bool AllowReorder = NumElts == 2;
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+ None, AllowReorder)) {
+ // Success. Start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+ // We may go through BB multiple times so skip the one we have checked.
+ if (!VisitedInstrs.insert(&*it).second)
+ continue;
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
+ if (P->getNumIncomingValues() != 2)
+ return Changed;
+
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ continue;
+ }
+
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+ }
+
+ // Try to vectorize horizontal reductions feeding into a return.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+ }
+
+ // Try to vectorize trees that start at compare instructions.
+ if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ break;
+ }
+ }
+ continue;
+ }
+
+ // Try to vectorize trees that start at insertelement instructions.
+ if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
+ continue;
+
+ // Vectorize starting with the build vector operands ignoring the
+ // BuildVector instructions for the purpose of scheduling and user
+ // extraction.
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+
+ continue;
+ }
+
+ // Try to vectorize trees that start at insertvalue instructions feeding into
+ // a store.
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
+ continue;
+
+ DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // We process the getelementptr list in chunks of 16 (like we do for
+ // stores) to minimize compile-time.
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
+ auto Len = std::min<unsigned>(BE - BI, 16);
+ auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+
+ // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, the WeakTrackingVHs will have
+ // nullified the values, so remove them from the set of candidates.
+ Candidates.remove(nullptr);
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPList[I]);
+ Candidates.remove(GEPList[J]);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPList[J]);
+ }
+ }
+ }
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+ assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+ // Try and vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
+
+ // Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ unsigned Len = std::min<unsigned>(CE - CI, 16);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
+ }
+ }
+ return Changed;
+}
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 000000000000..fb2f509dcbaa
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,48 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+ initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
+ initializeLoadStoreVectorizerPass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+ initializeVectorization(*unwrap(R));
+}
+
+// DEPRECATED: Remove after the LLVM 5 release.
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}